Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions dvc/istextfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,11 @@
TEXT_CHARS = bytes(range(32, 127)) + b"\n\r\t\f\b"


def istextfile(fname, blocksize=512):
""" Uses heuristics to guess whether the given file is text or binary,
by reading a single block of bytes from the file.
If more than 30% of the chars in the block are non-text, or there
are NUL ('\x00') bytes in the block, assume this is a binary file.
def istext(block):
""" Uses heuristics to guess whether the given block of bytes is text or
binary. If more than 30% of the chars in the block are non-text, or there
are NUL ('\x00') bytes in the block, assume this is a binary file.
"""
with open(fname, "rb") as fobj:
block = fobj.read(blocksize)

if not block:
# An empty file is considered a valid text file
return True
Expand All @@ -28,3 +24,11 @@ def istextfile(fname, blocksize=512):
# occurrences of TEXT_CHARS from the block
nontext = block.translate(None, TEXT_CHARS)
return float(len(nontext)) / len(block) <= 0.30


def istextfile(fname, blocksize=2048):
""" Uses heuristics on the first 'blocksize' bytes in a file to guess
whether the given file is text or binary.
"""
with open(fname, "rb") as fobj:
return istext(fobj.read(blocksize))
9 changes: 6 additions & 3 deletions dvc/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,13 @@ def dos2unix(data):
def file_md5(fname):
""" get the (md5 hexdigest, md5 digest) of a file """
from dvc.progress import Tqdm
from dvc.istextfile import istextfile
from dvc.istextfile import istext

fname = fspath_py35(fname)

if os.path.exists(fname):
hash_md5 = hashlib.md5()
binary = not istextfile(fname)
is_file_binary = None
size = os.path.getsize(fname)
no_progress_bar = True
if size >= LARGE_FILE_SIZE:
Expand All @@ -62,7 +62,10 @@ def file_md5(fname):
if not data:
break

if binary:
if is_file_binary is None:
is_file_binary = not istext(data)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This basically kiils the heuristics and starts checking the whole file contents. I suppose we could extend the heuristics to "check first N bytes + check last N bytes" to cover your particular case, but there is no real need for it, as the main intention for this heuristics is providing md5 compatibility for git-tracked files between *nix and windows, where git uses CRLF by default even if you didn't have them before in your file. So we actually need to have a heuristic that is an exact match to the one that git uses. πŸ™‚

That being said, we sure could look into optimizing this by not opening the file again or optimizing dos2unix() etc.

Copy link
Copy Markdown
Author

@kevin-hanselman kevin-hanselman Jan 31, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW: This code still only checks the first N bytes. Where N was 512 bytes before, now it's the first MB. (Note how this if statement only gets entered once -- if is_file_binary has it's initial non-bool value of None.)

Copy link
Copy Markdown
Author

@kevin-hanselman kevin-hanselman Jan 31, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


if is_file_binary:
chunk = data
else:
chunk = dos2unix(data)
Expand Down