diff --git a/dvc/istextfile.py b/dvc/istextfile.py index 2e10592d9c..fc2bc516f3 100644 --- a/dvc/istextfile.py +++ b/dvc/istextfile.py @@ -7,15 +7,11 @@ TEXT_CHARS = bytes(range(32, 127)) + b"\n\r\t\f\b" -def istextfile(fname, blocksize=512): - """ Uses heuristics to guess whether the given file is text or binary, - by reading a single block of bytes from the file. - If more than 30% of the chars in the block are non-text, or there - are NUL ('\x00') bytes in the block, assume this is a binary file. +def istext(block): + """ Uses heuristics to guess whether the given block of bytes is text or + binary. If more than 30% of the chars in the block are non-text, or there + are NUL ('\x00') bytes in the block, assume this is a binary file. """ - with open(fname, "rb") as fobj: - block = fobj.read(blocksize) - if not block: # An empty file is considered a valid text file return True @@ -28,3 +24,11 @@ def istextfile(fname, blocksize=512): # occurrences of TEXT_CHARS from the block nontext = block.translate(None, TEXT_CHARS) return float(len(nontext)) / len(block) <= 0.30 + + +def istextfile(fname, blocksize=2048): + """ Uses heuristics on the first 'blocksize' bytes in a file to guess + whether the given file is text or binary. + """ + with open(fname, "rb") as fobj: + return istext(fobj.read(blocksize)) diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index bf02ecdb24..0011770435 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -32,13 +32,13 @@ def dos2unix(data): def file_md5(fname): """ get the (md5 hexdigest, md5 digest) of a file """ from dvc.progress import Tqdm - from dvc.istextfile import istextfile + from dvc.istextfile import istext fname = fspath_py35(fname) if os.path.exists(fname): hash_md5 = hashlib.md5() - binary = not istextfile(fname) + is_file_binary = None size = os.path.getsize(fname) no_progress_bar = True if size >= LARGE_FILE_SIZE: @@ -62,7 +62,10 @@ def file_md5(fname): if not data: break - if binary: + if is_file_binary is None: + is_file_binary = not istext(data) + + if is_file_binary: chunk = data else: chunk = dos2unix(data)