Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
# See https://pre-commit.com for more information
repos:
- repo: https://github.com/jedie/cli-base-utilities
rev: v0.29.1
rev: v0.30.0
hooks:
- id: update-readme-history
2 changes: 1 addition & 1 deletion PyHardLinkBackup/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
"""

# See https://packaging.python.org/en/latest/specifications/version-specifiers/
__version__ = '1.8.4'
__version__ = '1.9.0'
__author__ = 'Jens Diemer <PyHardLinkBackup@jensdiemer.de>'
5 changes: 3 additions & 2 deletions PyHardLinkBackup/backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ def backup_one_file(
else:
logger.info('Store unique file: %s to %s', src_path, dst_path)
dst_path.write_bytes(file_content)
hash_db[file_hash] = dst_path
backup_result.copied_files += 1
backup_result.copied_size += size

Expand All @@ -146,10 +145,12 @@ def backup_one_file(
else:
logger.info('Copy unique file: %s to %s', src_path, dst_path)
copy_with_progress(src_path, dst_path, progress=progress, total_size=size)
hash_db[file_hash] = dst_path
backup_result.copied_files += 1
backup_result.copied_size += size

# Store new file in hash database or update existing entry to latest backed-up file:
hash_db[file_hash] = dst_path

# Keep original file metadata (permission bits, time stamps, and flags)
shutil.copystat(src_path, dst_path)
else:
Expand Down
5 changes: 3 additions & 2 deletions PyHardLinkBackup/cli_app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,10 @@ def version():


def main(args: Sequence[str] | None = None):
print_version(PyHardLinkBackup)
project_name = 'phlb' # Enforce program name if pipx used
print_version(PyHardLinkBackup, project_name=project_name)
app.cli(
prog='phlb', # Enforce program name if pipx used
prog=project_name,
description=constants.CLI_EPILOG,
use_underscores=False, # use hyphens instead of underscores
sort_subcommands=True,
Expand Down
44 changes: 25 additions & 19 deletions PyHardLinkBackup/tests/test_backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ def test_happy_path(self):
'wb backups/source/2026-01-01-123456/min_sized_file1.bin',
'w backups/.phlb/hash-lookup/bb/c4/bbc4de2ca238d1ec41fb622b75b5cf7d31a6d2ac92405043dd8f8220364fefc8',
'a backups/source/2026-01-01-123456/SHA256SUMS',
'w backups/.phlb/hash-lookup/bb/c4/bbc4de2ca238d1ec41fb622b75b5cf7d31a6d2ac92405043dd8f8220364fefc8',
'a backups/source/2026-01-01-123456/SHA256SUMS',
'w backups/source/2026-01-01-123456-summary.txt',
],
Expand Down Expand Up @@ -320,7 +321,7 @@ def test_happy_path(self):
assert_hash_db_info(
backup_root=self.backup_root,
expected="""
bb/c4/bbc4de2ca238d1… -> source/2026-01-01-123456/min_sized_file1.bin
bb/c4/bbc4de2ca238d1… -> source/2026-01-01-123456/min_sized_file2.bin
e3/71/e3711d0eacddeb… -> source/2026-01-01-123456/large_file1.bin
""",
)
Expand Down Expand Up @@ -416,15 +417,15 @@ def test_happy_path(self):
redirected_out.stdout,
)

# The FileHashDatabase remains the same:
# The FileHashDatabase always points to the latest backed-up files:
with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
assert_hash_db_info(
backup_root=self.backup_root,
expected="""
23/d2/23d2ce40d26211… -> source/2026-01-02-123456/min_sized_file_newA.bin
9a/56/9a567077114134… -> source/2026-01-02-123456/min_sized_file_newB.bin
bb/c4/bbc4de2ca238d1… -> source/2026-01-01-123456/min_sized_file1.bin
e3/71/e3711d0eacddeb… -> source/2026-01-01-123456/large_file1.bin
bb/c4/bbc4de2ca238d1… -> source/2026-01-02-123456/min_sized_file2.bin
e3/71/e3711d0eacddeb… -> source/2026-01-02-123456/large_file2.bin
""",
)

Expand Down Expand Up @@ -463,9 +464,13 @@ def test_happy_path(self):
'a backups/source/2026-01-02-123456/SHA256SUMS',
'wb backups/source/2026-01-02-123456/hardlink2file1',
'a backups/source/2026-01-02-123456/SHA256SUMS',
'w backups/.phlb/hash-lookup/e3/71/e3711d0eacddeb105af4ad9b0d63069d759acf32e49712663419e68dc294a94a',
'a backups/source/2026-01-02-123456/SHA256SUMS',
'w backups/.phlb/hash-lookup/e3/71/e3711d0eacddeb105af4ad9b0d63069d759acf32e49712663419e68dc294a94a',
'a backups/source/2026-01-02-123456/SHA256SUMS',
'w backups/.phlb/hash-lookup/bb/c4/bbc4de2ca238d1ec41fb622b75b5cf7d31a6d2ac92405043dd8f8220364fefc8',
'a backups/source/2026-01-02-123456/SHA256SUMS',
'w backups/.phlb/hash-lookup/bb/c4/bbc4de2ca238d1ec41fb622b75b5cf7d31a6d2ac92405043dd8f8220364fefc8',
'a backups/source/2026-01-02-123456/SHA256SUMS',
'wb backups/source/2026-01-02-123456/min_sized_file_newA.bin',
'w backups/.phlb/hash-lookup/23/d2/23d2ce40d26211a9ffe8096fd1f927f2abd094691839d24f88440f7c5168d500',
Expand Down Expand Up @@ -499,8 +504,8 @@ def test_happy_path(self):
# Don't create broken hardlinks!

"""DocWrite: README.md ## FileHashDatabase - Missing hardlink target file
If a hardlink source from a old backup is missing, we cannot create a hardlink to it.
But it still works to hardlink same files within the current backup.
Deleting files from old backups is safe: the hash DB entry always points to the
most recently backed-up file, so subsequent backups can still create hardlinks.
"""

# Let's remove one of the files used for hardlinking from the first backup:
Expand All @@ -515,8 +520,8 @@ def test_happy_path(self):
self.assertIn('Backup complete', redirected_out.stdout)
backup_dir = result.backup_dir

# Note: min_sized_file1.bin and min_sized_file2.bin are hardlinked,
# but not with the first backup anymore! So it's only nlink=2 now!
# Note: min_sized_file1.bin and min_sized_file2.bin accumulate hardlinks
# because hash_db always points to the latest backup file.
with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
assert_fs_tree_overview(
root=backup_dir,
Expand All @@ -527,8 +532,8 @@ def test_happy_path(self):
hardlink2file1 12:00:00 file 1 14 8a11514a
large_file1.bin 12:00:00 hardlink 5 1001 fb3014ff
large_file2.bin 12:00:00 hardlink 5 1001 fb3014ff
min_sized_file1.bin 12:00:00 hardlink 2 1000 f0d93de4
min_sized_file2.bin 12:00:00 hardlink 2 1000 f0d93de4
min_sized_file1.bin 12:00:00 hardlink 5 1000 f0d93de4
min_sized_file2.bin 12:00:00 hardlink 5 1000 f0d93de4
min_sized_file_newA.bin 12:00:00 hardlink 2 1001 a48f0e33
min_sized_file_newB.bin 12:00:00 hardlink 2 1000 7d9c564d
small_file_newA.txt 12:00:00 file 1 10 76d1acf1
Expand All @@ -547,26 +552,26 @@ def test_happy_path(self):
backup_count=12,
backup_size=6091,
symlink_files=1,
hardlinked_files=5,
hardlinked_size=5003,
copied_files=6,
copied_size=1074,
hardlinked_files=6,
hardlinked_size=6003,
copied_files=5,
copied_size=74,
copied_small_files=5,
copied_small_size=74,
error_count=0,
),
)

# Note: min_sized_file1.bin is now from the 2026-01-03 backup!
# All files now point to "2026-01-03" and none of them to the first "2026-01-01" backup:
self.assertEqual(backup_dir.name, '2026-01-03-123456') # Latest backup dir name
with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
assert_hash_db_info(
backup_root=self.backup_root,
expected="""
23/d2/23d2ce40d26211… -> source/2026-01-02-123456/min_sized_file_newA.bin
9a/56/9a567077114134… -> source/2026-01-02-123456/min_sized_file_newB.bin
bb/c4/bbc4de2ca238d1… -> source/2026-01-03-123456/min_sized_file1.bin
e3/71/e3711d0eacddeb… -> source/2026-01-01-123456/large_file1.bin
23/d2/23d2ce40d26211… -> source/2026-01-03-123456/min_sized_file_newA.bin
9a/56/9a567077114134… -> source/2026-01-03-123456/min_sized_file_newB.bin
bb/c4/bbc4de2ca238d1… -> source/2026-01-03-123456/min_sized_file2.bin
e3/71/e3711d0eacddeb… -> source/2026-01-03-123456/large_file2.bin
""",
)

Expand Down Expand Up @@ -916,6 +921,7 @@ def test_large_file_handling(self):
[
'w backups/.phlb_test',
'a backups/source/2026-02-22-123456-backup.log',
'w backups/.phlb/hash-lookup/23/d2/23d2ce40d26211a9ffe8096fd1f927f2abd094691839d24f88440f7c5168d500',
'a backups/source/2026-02-22-123456/SHA256SUMS',
'wb backups/source/2026-02-22-123456/large_fileB.txt',
'w backups/.phlb/hash-lookup/2a/92/2a925556d3ec9e4258624a324cd9300a9a3d9c86dac6bbbb63071bdb7787afd2',
Expand Down
2 changes: 1 addition & 1 deletion PyHardLinkBackup/tests/test_project_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def test_version(self):
assert_is_file(cli_bin)

output = subprocess.check_output([cli_bin, 'version'], text=True)
self.assertIn(f'PyHardLinkBackup v{__version__}', output)
self.assertIn(f'phlb v{__version__}', output)

dev_cli_bin = PACKAGE_ROOT / 'dev-cli.py'
assert_is_file(dev_cli_bin)
Expand Down
6 changes: 0 additions & 6 deletions PyHardLinkBackup/tests/test_readme.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,6 @@ def test_main_help(self):
),
)

# Installed via pipx is called 'phlb', not 'cli.py':
stdout = stdout.replace('./cli.py', 'phlb')

assert_cli_help_in_readme(text_block=stdout, marker='main help')

def test_backup_help(self):
Expand All @@ -63,9 +60,6 @@ def test_backup_help(self):
),
)

# Installed via pipx is called 'phlb', not 'cli.py':
stdout = stdout.replace('./cli.py', 'phlb')

assert_cli_help_in_readme(text_block=stdout, marker='backup help')

def test_dev_help(self):
Expand Down
13 changes: 3 additions & 10 deletions PyHardLinkBackup/utilities/file_hash_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,6 @@
logger = logging.getLogger(__name__)


class HashAlreadyExistsError(ValueError):
pass


class FileHashDatabase:
"""DocWrite: README.md ## FileHashDatabase
A simple "database" to store file content hash <-> relative path mappings.
Expand Down Expand Up @@ -54,12 +50,9 @@ def get(self, hash: str) -> Path | None:
return abs_file_path

def __setitem__(self, hash: str, abs_file_path: Path):
"""
Create or update the hash entry with the given absolute file path.
"""
hash_path = self._get_hash_path(hash)
hash_path.parent.mkdir(parents=True, exist_ok=True)

# File should be found before and results in hardlink creation!
# So deny change of existing hashes:
if hash_path.exists():
raise HashAlreadyExistsError(f'Hash {hash} already exists in the database!')

hash_path.write_text(str(abs_file_path.relative_to(self.backup_root)))
34 changes: 27 additions & 7 deletions PyHardLinkBackup/utilities/tests/test_file_hash_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from bx_py_utils.test_utils.log_utils import NoLogs
from cli_base.cli_tools.test_utils.base_testcases import BaseTestCase

from PyHardLinkBackup.utilities.file_hash_database import FileHashDatabase, HashAlreadyExistsError
from PyHardLinkBackup.utilities.file_hash_database import FileHashDatabase
from PyHardLinkBackup.utilities.filesystem import iter_scandir_files


Expand Down Expand Up @@ -127,10 +127,28 @@ def test_happy_path(self):
)

########################################################################################
# Deny "overwrite" of existing hash:
# Update existing hash to point to a newer file:

with self.assertRaises(HashAlreadyExistsError):
hash_db['12abcd345678abcdef'] = 'foo/bar/baz' # already exists!
"""DocWrite: README.md ## FileHashDatabase
The entry for each hash is always updated to point to the most recently backed-up file.
This means you can safely delete old backups: the hash DB will still point to a valid
file in the most recent backup, so deduplication continues to work correctly.
"""

file_c_path = backup_root_path / 'rel/path/to/file-C'
file_c_path.parent.mkdir(parents=True, exist_ok=True)
file_c_path.touch()

hash_db['12abcd345678abcdef'] = file_c_path
self.assertEqual(hash_db.get('12abcd345678abcdef'), file_c_path)
with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
assert_hash_db_info(
backup_root=hash_db.backup_root,
expected="""
12/34/12345678abcdef… -> rel/path/to/file-A
12/ab/12abcd345678ab… -> rel/path/to/file-C
""",
)

########################################################################################
# Don't use stale entries pointing to missing files:
Expand All @@ -139,15 +157,17 @@ def test_happy_path(self):
file_a_path.unlink()

"""DocWrite: README.md ## FileHashDatabase - Missing hardlink target file
We check if the hardlink source file still exists. If not, we remove the hash entry from the database.
A warning is logged in this case."""
The `get()` method checks whether the referenced file still exists.
If not, the stale entry is removed and a warning is logged.
On the next backup run, the file is then copied fresh instead of hardlinked.
"""
with self.assertLogs(level=logging.WARNING) as logs:
self.assertIs(hash_db.get('12345678abcdef'), None)
self.assertIn('Hash database entry found, but file does not exist', ''.join(logs.output))
with self.assertLogs('PyHardLinkBackup', level=logging.DEBUG):
assert_hash_db_info(
backup_root=hash_db.backup_root,
expected="""
12/ab/12abcd345678ab… -> rel/path/to/file-B
12/ab/12abcd345678ab… -> rel/path/to/file-C
""",
)
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,9 @@ Overview of main changes:

[comment]: <> (✂✂✂ auto generated history start ✂✂✂)

* [v1.9.0](https://github.com/jedie/PyHardLinkBackup/compare/v1.8.4...v1.9.0)
* 2026-04-14 - Update existing links in hash database
* 2026-04-14 - Update requirements
* [v1.8.4](https://github.com/jedie/PyHardLinkBackup/compare/v1.8.3...v1.8.4)
* 2026-04-09 - Update requirements
* 2026-04-09 - Apply project updates
Expand All @@ -295,6 +298,9 @@ Overview of main changes:
* 2026-03-28 - Update requirements
* 2026-03-28 - apply manageprojects updates
* 2026-03-25 - fix some code styles

<details><summary>Expand older history entries ...</summary>

* [v1.8.1](https://github.com/jedie/PyHardLinkBackup/compare/v1.8.0...v1.8.1)
* 2026-01-24 - Update packaging commands related to new direct "uv" usage
* 2026-01-24 - Bugfix "rebuild" command
Expand All @@ -304,9 +310,6 @@ Overview of main changes:
* 2026-01-22 - rebuid command: skip hashing same files by check the inode uniqueness
* 2026-01-22 - Add "fs-info" in dev cli
* 2026-01-22 - rebuild command: fix wrong progress bar

<details><summary>Expand older history entries ...</summary>

* [v1.8.0](https://github.com/jedie/PyHardLinkBackup/compare/v1.7.3...v1.8.0)
* 2026-01-22 - Add optional "--name" to enforce a name for the backup sub directory
* 2026-01-22 - Do not cross filesystem boundaries as default
Expand Down
13 changes: 9 additions & 4 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,18 @@ Notes:
* The "relative path" that will be stored is not validated, so it can be any string.
* We don't "cache" anything in Memory, to avoid high memory consumption for large datasets.

The entry for each hash is always updated to point to the most recently backed-up file.
This means you can safely delete old backups: the hash DB will still point to a valid
file in the most recent backup, so deduplication continues to work correctly.

## FileHashDatabase - Missing hardlink target file

If a hardlink source from a old backup is missing, we cannot create a hardlink to it.
But it still works to hardlink same files within the current backup.
Deleting files from old backups is safe: the hash DB entry always points to the
most recently backed-up file, so subsequent backups can still create hardlinks.

We check if the hardlink source file still exists. If not, we remove the hash entry from the database.
A warning is logged in this case.
The `get()` method checks whether the referenced file still exists.
If not, the stale entry is removed and a warning is logged.
On the next backup run, the file is then copied fresh instead of hardlinked.

## FileSizeDatabase

Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ authors = [
]
requires-python = ">=3.12"
dependencies = [
"cli-base-utilities>=0.27.1", # https://github.com/jedie/cli-base-utilities
"cli-base-utilities", # https://github.com/jedie/cli-base-utilities
"bx_py_utils", # https://github.com/boxine/bx_py_utils
"tyro", # https://github.com/brentyi/tyro
"rich", # https://github.com/Textualize/rich
Expand Down Expand Up @@ -43,8 +43,9 @@ exclude-newer = "1 week"
[tool.uv.exclude-newer-package]
# Exclude own packages from the "exclude-newer" rule and
# add external packages temporarily to fix known issues or current CVEs
uv = "2026-04-13T12:00:00Z"
cli-base-utilities = "2026-04-13T12:00:00Z"
cryptography = "2026-04-08T12:00:00Z"
django = "2026-04-08T12:00:00Z"


[tool.cli_base.pip_audit]
Expand Down
Loading
Loading