From f9cefef70914f1ba267eb4b09d930033ddfa1ada Mon Sep 17 00:00:00 2001 From: badrogger Date: Mon, 14 Oct 2024 18:33:40 +0000 Subject: [PATCH 1/6] Add sync-node repair command --- node_cli/cli/sync_node.py | 43 +++++++++++++++++++++++- node_cli/core/node.py | 22 ++++++++++++ node_cli/core/schains.py | 59 +++++++++++++++++++++++++++++---- node_cli/operations/__init__.py | 3 +- node_cli/operations/base.py | 31 +++++++++++++++-- node_cli/utils/docker_utils.py | 50 ++++++++++++++++++++++++++++ 6 files changed, 198 insertions(+), 10 deletions(-) diff --git a/node_cli/cli/sync_node.py b/node_cli/cli/sync_node.py index b6247c1a..304a20d5 100644 --- a/node_cli/cli/sync_node.py +++ b/node_cli/cli/sync_node.py @@ -21,7 +21,7 @@ import click -from node_cli.core.node import init_sync, update_sync +from node_cli.core.node import init_sync, update_sync, repair_sync from node_cli.utils.helper import ( abort_if_false, error_exit, @@ -95,3 +95,44 @@ def _init_sync(env_file, archive, catchup, historic_state, snapshot_from: Option @streamed_cmd def _update_sync(env_file, unsafe_ok): update_sync(env_file) + + +@sync_node.command('repair', help='Start sync node from empty database') +@click.option('--yes', is_flag=True, callback=abort_if_false, + expose_value=False, + prompt='Are you sure you want to start sync node from empty database?') +@click.option( + '--archive', + help=TEXTS['init']['archive'], + is_flag=True +) +@click.option( + '--catchup', + help=TEXTS['init']['catchup'], + is_flag=True +) +@click.option( + '--historic-state', + help=TEXTS['init']['historic_state'], + is_flag=True +) +@click.option( + '--snapshot-from', + type=IP_TYPE, + default=None, + hidden=True, + help='Ip of the node from to download snapshot from' +) +@streamed_cmd +def _repair_sync( + archive: str, + catchup: str, + historic_state: str, + snapshot_from: Optional[str] = None +) -> None: + repair_sync( + archive=archive, + catchup=catchup, + historic_state=historic_state, + snapshot_from=snapshot_from + ) diff --git a/node_cli/core/node.py b/node_cli/core/node.py index da06a11d..bfd314b7 100644 --- a/node_cli/core/node.py +++ b/node_cli/core/node.py @@ -58,6 +58,7 @@ turn_on_op, restore_op, init_sync_op, + repair_sync_op, update_sync_op ) from node_cli.utils.print_formatters import ( @@ -234,6 +235,27 @@ def update_sync(env_filepath: str, unsafe_ok: bool = False) -> None: logger.info('Node update finished') +@check_inited +@check_user +def repair_sync( + archive: bool, + catchup: bool, + historic_state: bool, + snapshot_from: str +) -> None: + + env_params = extract_env_params(INIT_ENV_FILEPATH, sync_node=True) + schain_name = env_params['SCHAIN_NAME'] + repair_sync_op( + schain_name=schain_name, + archive=archive, + catchup=catchup, + historic_state=historic_state, + snapshot_from=snapshot_from + ) + logger.info('Schain was started from scratch') + + def get_node_env( env_filepath, inited_node=False, diff --git a/node_cli/core/schains.py b/node_cli/core/schains.py index 2d40677e..a27b41fc 100644 --- a/node_cli/core/schains.py +++ b/node_cli/core/schains.py @@ -1,3 +1,4 @@ +import glob import logging import os import pprint @@ -11,7 +12,8 @@ ALLOCATION_FILEPATH, NODE_CONFIG_PATH, NODE_CLI_STATUS_FILENAME, - SCHAIN_NODE_DATA_PATH + SCHAIN_NODE_DATA_PATH, + SCHAINS_MNT_DIR_SYNC ) from node_cli.configs.env import get_env_config @@ -94,20 +96,37 @@ def get_node_cli_schain_status_filepath(schain_name: str) -> str: return os.path.join(SCHAIN_NODE_DATA_PATH, schain_name, NODE_CLI_STATUS_FILENAME) -def update_node_cli_schain_status(schain_name: str, status: dict) -> None: +def update_node_cli_schain_status( + schain_name: str, + repair_ts: Optional[int] = None, + snapshot_from: Optional[str] = None +) -> None: path = get_node_cli_schain_status_filepath(schain_name) - os.makedirs(os.path.dirname(path), exist_ok=True) + if os.path.isdir(path): + orig_status = get_node_cli_schain_status(schain_name=schain_name) + orig_status.update({'repair_ts': repair_ts, 'snapshot_from': snapshot_from}) + status = orig_status + else: + status = { + 'schain_name': schain_name, + 'repair_ts': repair_ts, + 'snapshot_from': snapshot_from + } + os.makedirs(os.path.dirname(path), exist_ok=True) save_json(path, status) +def get_node_cli_schain_status(schain_name: str) -> dict: + path = get_node_cli_schain_status_filepath(schain_name) + return read_json(path) + + def toggle_schain_repair_mode( schain: str, snapshot_from: Optional[str] = None ) -> None: ts = int(time.time()) - status = {'schain_name': schain, 'repair_ts': ts} - status.update({'snapshot_from': snapshot_from}) - update_node_cli_schain_status(schain, status) + update_node_cli_schain_status(schain_name=schain, repair_ts=ts, snapshot_from=snapshot_from) print('Schain has been set for repair') @@ -168,6 +187,10 @@ def make_btrfs_snapshot(src: str, dst: str) -> None: run_cmd(['btrfs', 'subvolume', 'snapshot', src, dst]) +def rm_btrfs_subvolume(subvolume: str) -> None: + run_cmd(['btrfs', 'subvolume', 'delete', subvolume]) + + def fillin_snapshot_folder(src_path: str, block_number: int) -> None: snapshots_dirname = 'snapshots' snapshot_folder_path = os.path.join( @@ -224,3 +247,27 @@ def ensure_schain_volume(schain: str, schain_type: str, env_type: str) -> None: ensure_volume(schain, size) else: logger.warning('Volume %s already exists', schain) + + +def cleanup_sync_datadir(schain_name: str) -> None: + base_path = os.path.join(SCHAINS_MNT_DIR_SYNC, schain_name) + regular_folders_pattern = f'{base_path}/[!snapshots]*' + logger.info('Removing regular folders') + for filepath in glob.glob(regular_folders_pattern): + if os.path.isdir(filepath): + logger.debug('Removing recursively %s', filepath) + shutil.rmtree(filepath) + if os.path.isfile(filepath): + os.remove(filepath) + + logger.info('Removing subvolumes') + subvolumes_pattern = f'{base_path}/snapshots/*/*' + for filepath in glob.glob(subvolumes_pattern): + logger.debug('Deleting subvolume %s', filepath) + if os.path.isdir(filepath): + rm_btrfs_subvolume(filepath) + else: + os.remove(filepath) + logger.info('Cleaning up snapshots folder') + if os.path.isdir(base_path): + shutil.rmtree(base_path) diff --git a/node_cli/operations/__init__.py b/node_cli/operations/__init__.py index 11d3dd4d..ca1b076d 100644 --- a/node_cli/operations/__init__.py +++ b/node_cli/operations/__init__.py @@ -24,5 +24,6 @@ update_sync as update_sync_op, turn_off as turn_off_op, turn_on as turn_on_op, - restore as restore_op + restore as restore_op, + repair_sync as repair_sync_op ) diff --git a/node_cli/operations/base.py b/node_cli/operations/base.py index cf4a9eb1..01c3d616 100644 --- a/node_cli/operations/base.py +++ b/node_cli/operations/base.py @@ -47,12 +47,15 @@ from node_cli.operations.skale_node import download_skale_node, sync_skale_node, update_images from node_cli.core.checks import CheckType, run_checks as run_host_checks from node_cli.core.iptables import configure_iptables -from node_cli.core.schains import update_node_cli_schain_status +from node_cli.core.schains import update_node_cli_schain_status, cleanup_sync_datadir from node_cli.utils.docker_utils import ( compose_rm, compose_up, docker_cleanup, - remove_dynamic_containers + remove_dynamic_containers, + remove_schain_container, + start_admin, + stop_admin ) from node_cli.utils.meta import get_meta_info, update_meta from node_cli.utils.print_formatters import print_failed_requirements_checks @@ -344,3 +347,27 @@ def restore(env, backup_path, config_only=False): print_failed_requirements_checks(failed_checks) return False return True + + +def repair_sync( + schain_name: str, + archive: bool, + catchup: bool, + historic_state: bool, + snapshot_from: Optional[str] +) -> None: + stop_admin(sync_node=True) + remove_schain_container(schain_name=schain_name) + + logger.info('Updating node options') + cleanup_sync_datadir(schain_name=schain_name) + + logger.info('Updating node options') + node_options = NodeOptions() + node_options.archive = archive + node_options.catchup = catchup + node_options.historic_state = historic_state + + logger.info('Updating cli status') + update_node_cli_schain_status(schain_name, snapshot_from=snapshot_from) + start_admin(sync_node=True) diff --git a/node_cli/utils/docker_utils.py b/node_cli/utils/docker_utils.py index b48a3306..2f5e56a8 100644 --- a/node_cli/utils/docker_utils.py +++ b/node_cli/utils/docker_utils.py @@ -21,6 +21,7 @@ import itertools import os import logging +from typing import Optional import docker from docker.client import DockerClient @@ -39,6 +40,7 @@ logger = logging.getLogger(__name__) +ADMIN_REMOVE_TIMEOUT = 60 SCHAIN_REMOVE_TIMEOUT = 300 IMA_REMOVE_TIMEOUT = 20 TELEGRAF_REMOVE_TIMEOUT = 20 @@ -131,6 +133,54 @@ def safe_rm(container: Container, timeout=DOCKER_DEFAULT_STOP_TIMEOUT, **kwargs) logger.info(f'Container removed: {container_name}') +def stop_container( + container_name: str, + timeout: int = DOCKER_DEFAULT_STOP_TIMEOUT, + dclient: Optional[DockerClient] = None +) -> None: + dc = dclient or docker_client() + container = dc.containers.get(container_name) + logger.info('Stopping container: %s, timeout: %s', container_name, timeout) + container.stop(timeout=timeout) + + +def rm_container( + container_name: str, + timeout: int = DOCKER_DEFAULT_STOP_TIMEOUT, + dclient: Optional[DockerClient] = None +) -> None: + dc = dclient or docker_client() + container_names = [container.name for container in get_containers()] + if container_name in container_names: + container = dc.containers.get(container_name) + safe_rm(container) + + +def start_container( + container_name: str, + dclient: Optional[DockerClient] = None +) -> None: + dc = dclient or docker_client() + container = dc.containers.get(container_name) + logger.info('Starting container %s', container_name) + container.start() + + +def start_admin(sync_node: bool = False, dclient: Optional[DockerClient] = None) -> None: + container_name = 'skale_sync_admin' if sync_node else 'skale_admin' + start_container(container_name=container_name, dclient=dclient) + + +def stop_admin(sync_node: bool = False, dclient: Optional[DockerClient] = None) -> None: + container_name = 'skale_sync_admin' if sync_node else 'skale_admin' + stop_container(container_name=container_name, timeout=ADMIN_REMOVE_TIMEOUT, dclient=dclient) + + +def remove_schain_container(schain_name: str, dclient: Optional[DockerClient] = None) -> None: + container_name = f'skale_schain_{schain_name}' + rm_container(container_name, timeout=SCHAIN_REMOVE_TIMEOUT, dclient=dclient) + + def backup_container_logs( container: Container, head: int = DOCKER_DEFAULT_HEAD_LINES, From f044833ff8499f31b686efff89f6c7d70f9535b1 Mon Sep 17 00:00:00 2001 From: badrogger Date: Mon, 14 Oct 2024 19:26:51 +0000 Subject: [PATCH 2/6] Add cleanup_sync_datadir test --- node_cli/core/schains.py | 5 +-- tests/conftest.py | 12 ++++++- tests/core/core_schains_test.py | 61 +++++++++++++++++++++++++++++++-- tests/helper.py | 2 ++ 4 files changed, 74 insertions(+), 6 deletions(-) diff --git a/node_cli/core/schains.py b/node_cli/core/schains.py index a27b41fc..0d888835 100644 --- a/node_cli/core/schains.py +++ b/node_cli/core/schains.py @@ -249,8 +249,9 @@ def ensure_schain_volume(schain: str, schain_type: str, env_type: str) -> None: logger.warning('Volume %s already exists', schain) -def cleanup_sync_datadir(schain_name: str) -> None: - base_path = os.path.join(SCHAINS_MNT_DIR_SYNC, schain_name) +def cleanup_sync_datadir(schain_name: str, base_path: str = SCHAINS_MNT_DIR_SYNC) -> None: + base_path = os.path.join(base_path, schain_name) + print('HERE', base_path) regular_folders_pattern = f'{base_path}/[!snapshots]*' logger.info('Removing regular folders') for filepath in glob.glob(regular_folders_pattern): diff --git a/tests/conftest.py b/tests/conftest.py index 93d82521..677b27e6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -44,7 +44,7 @@ from node_cli.utils.docker_utils import docker_client from node_cli.utils.global_config import generate_g_config_file -from tests.helper import TEST_META_V1, TEST_META_V2, TEST_META_V3 +from tests.helper import TEST_META_V1, TEST_META_V2, TEST_META_V3, TEST_SCHAINS_MNT_DIR_SYNC TEST_ENV_PARAMS = """ @@ -312,3 +312,13 @@ def tmp_schains_dir(): yield SCHAIN_NODE_DATA_PATH finally: shutil.rmtree(SCHAIN_NODE_DATA_PATH) + + +@pytest.fixture +def tmp_sync_datadir(): + + os.makedirs(TEST_SCHAINS_MNT_DIR_SYNC) + try: + yield TEST_SCHAINS_MNT_DIR_SYNC + finally: + shutil.rmtree(TEST_SCHAINS_MNT_DIR_SYNC) diff --git a/tests/core/core_schains_test.py b/tests/core/core_schains_test.py index 0c27a7ad..1681ce20 100644 --- a/tests/core/core_schains_test.py +++ b/tests/core/core_schains_test.py @@ -1,9 +1,12 @@ import os import datetime +from unittest import mock +from pathlib import Path + import freezegun -from node_cli.core.schains import toggle_schain_repair_mode +from node_cli.core.schains import cleanup_sync_datadir, toggle_schain_repair_mode from node_cli.utils.helper import read_json @@ -13,11 +16,11 @@ @freezegun.freeze_time(CURRENT_DATETIME) def test_toggle_repair_mode(tmp_schains_dir): - schain_name = "test_schain" + schain_name = 'test_schain' schain_folder = os.path.join(tmp_schains_dir, schain_name) os.mkdir(schain_folder) toggle_schain_repair_mode(schain_name) - schain_status_path = os.path.join(schain_folder, "node_cli.status") + schain_status_path = os.path.join(schain_folder, 'node_cli.status') assert os.path.isfile(schain_status_path) assert read_json(schain_status_path) == { @@ -33,3 +36,55 @@ def test_toggle_repair_mode(tmp_schains_dir): 'schain_name': 'test_schain', 'snapshot_from': '127.0.0.1', } + + +@freezegun.freeze_time(CURRENT_DATETIME) +def test_cleanup_sync_datadir(tmp_sync_datadir): + schain_name = 'test_schain' + base_folder = Path(tmp_sync_datadir).joinpath(schain_name) + base_folder.mkdir() + folders = [ + '28e07f34', + 'block_sigshares_0.db', + 'da_proofs_0.db', + 'filestorage', + 'incoming_msgs_0.db', + 'proposal_hashes_0.db', + 'snapshots', + 'blocks_0.db', + 'da_sigshares_0.db', + 'historic_roots', + 'internal_info_0.db', + 'outgoing_msgs_0.db', + 'proposal_vectors_0.db', + 'block_proposals_0.db', + 'consensus_state_0.db', + 'diffs', + 'historic_state', + 'prices_0.db', + 'randoms_0.db', + ] + regular_files = ['HEALTH_CHECK', 'keys.info', 'keys.info.salt'] + snapshots = ['0', '100', '111'] + snapshot_content = ['28e07f34', 'blocks_0.db', 'filestorage', 'prices_0.db'] + + for folder_name in folders: + path = base_folder.joinpath(folder_name) + path.mkdir() + + for file_name in regular_files: + path = base_folder.joinpath(file_name) + path.touch() + + for snapshot_block in snapshots: + snapshot_folder = base_folder.joinpath('snapshots', snapshot_block) + snapshot_folder.mkdir() + for folder in snapshot_content: + content_path = snapshot_folder.joinpath(folder) + content_path.mkdir() + hash_path = snapshot_folder.joinpath('snapshot_hash.txt') + hash_path.touch() + + with mock.patch('node_cli.core.schains.rm_btrfs_subvolume'): + cleanup_sync_datadir(schain_name, base_path=tmp_sync_datadir) + assert not os.path.isdir(base_folder) diff --git a/tests/helper.py b/tests/helper.py index c753e176..805fcf51 100644 --- a/tests/helper.py +++ b/tests/helper.py @@ -27,6 +27,8 @@ BLOCK_DEVICE = os.getenv('BLOCK_DEVICE') +TEST_SCHAINS_MNT_DIR_SYNC = 'tests/tmp' + TEST_META_V1 = { 'version': '0.1.1', 'config_stream': 'develop' From 9cd26a7ef5589f34a111051d58f364e6bc505662 Mon Sep 17 00:00:00 2001 From: badrogger Date: Tue, 15 Oct 2024 12:59:34 +0000 Subject: [PATCH 3/6] Add repair_sync test --- node_cli/core/schains.py | 1 - tests/core/core_node_test.py | 13 +++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/node_cli/core/schains.py b/node_cli/core/schains.py index 0d888835..ab646b8f 100644 --- a/node_cli/core/schains.py +++ b/node_cli/core/schains.py @@ -251,7 +251,6 @@ def ensure_schain_volume(schain: str, schain_type: str, env_type: str) -> None: def cleanup_sync_datadir(schain_name: str, base_path: str = SCHAINS_MNT_DIR_SYNC) -> None: base_path = os.path.join(base_path, schain_name) - print('HERE', base_path) regular_folders_pattern = f'{base_path}/[!snapshots]*' logger.info('Removing regular folders') for filepath in glob.glob(regular_folders_pattern): diff --git a/tests/core/core_node_test.py b/tests/core/core_node_test.py index c3c3e11d..5bc02f97 100644 --- a/tests/core/core_node_test.py +++ b/tests/core/core_node_test.py @@ -12,7 +12,7 @@ from node_cli.configs import NODE_DATA_PATH from node_cli.configs.resource_allocation import RESOURCE_ALLOCATION_FILEPATH from node_cli.core.node import BASE_CONTAINERS_AMOUNT, is_base_containers_alive -from node_cli.core.node import init, pack_dir, update, is_update_safe +from node_cli.core.node import init, pack_dir, update, is_update_safe, repair_sync from tests.helper import response_mock, safe_update_api_response, subprocess_run_mock from tests.resources_test import BIG_DISK_SIZE @@ -169,7 +169,9 @@ def test_update_node(mocked_g_config, resource_file): ), mock.patch('node_cli.core.resources.get_disk_size', return_value=BIG_DISK_SIZE), mock.patch( 'node_cli.core.host.init_data_dir' ): - with mock.patch('node_cli.utils.helper.requests.get', return_value=safe_update_api_response()): # noqa + with mock.patch( + 'node_cli.utils.helper.requests.get', return_value=safe_update_api_response() + ): # noqa result = update(env_filepath, pull_config_for_schain=None) assert result is None @@ -183,3 +185,10 @@ def test_is_update_safe(): 'node_cli.utils.helper.requests.get', return_value=safe_update_api_response(safe=False) ): assert not is_update_safe() + + +def test_repair_sync(tmp_sync_datadir, mocked_g_config, resource_file): + with mock.patch('node_cli.core.schains.rm_btrfs_subvolume'), \ + mock.patch('node_cli.utils.docker_utils.stop_container'), \ + mock.patch('node_cli.utils.docker_utils.start_container'): + repair_sync(archive=True, catchup=True, historic_state=True, snapshot_from='127.0.0.1') From c8611e9b57db7761735151436137b11d719f0f8a Mon Sep 17 00:00:00 2001 From: badrogger Date: Tue, 15 Oct 2024 13:00:44 +0000 Subject: [PATCH 4/6] Fix linter --- tests/core/core_node_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/core_node_test.py b/tests/core/core_node_test.py index 5bc02f97..01f42867 100644 --- a/tests/core/core_node_test.py +++ b/tests/core/core_node_test.py @@ -190,5 +190,5 @@ def test_is_update_safe(): def test_repair_sync(tmp_sync_datadir, mocked_g_config, resource_file): with mock.patch('node_cli.core.schains.rm_btrfs_subvolume'), \ mock.patch('node_cli.utils.docker_utils.stop_container'), \ - mock.patch('node_cli.utils.docker_utils.start_container'): + mock.patch('node_cli.utils.docker_utils.start_container'): repair_sync(archive=True, catchup=True, historic_state=True, snapshot_from='127.0.0.1') From 363b8cd7e07bf8abe098b9c634b15e59f76cb830 Mon Sep 17 00:00:00 2001 From: badrogger Date: Tue, 15 Oct 2024 14:47:08 +0000 Subject: [PATCH 5/6] Fix tests --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 677b27e6..54308b49 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -317,7 +317,7 @@ def tmp_schains_dir(): @pytest.fixture def tmp_sync_datadir(): - os.makedirs(TEST_SCHAINS_MNT_DIR_SYNC) + os.makedirs(TEST_SCHAINS_MNT_DIR_SYNC, exist_ok=True) try: yield TEST_SCHAINS_MNT_DIR_SYNC finally: From 110b4988d40ed7c3cabcb471ff2421a6e8c97580 Mon Sep 17 00:00:00 2001 From: badrogger Date: Tue, 15 Oct 2024 15:01:33 +0000 Subject: [PATCH 6/6] Fix tests --- tests/conftest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 54308b49..824ba93d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -307,7 +307,7 @@ def tmp_config_dir(): @pytest.fixture def tmp_schains_dir(): - os.makedirs(SCHAIN_NODE_DATA_PATH) + os.makedirs(SCHAIN_NODE_DATA_PATH, exist_ok=True) try: yield SCHAIN_NODE_DATA_PATH finally: @@ -316,7 +316,6 @@ def tmp_schains_dir(): @pytest.fixture def tmp_sync_datadir(): - os.makedirs(TEST_SCHAINS_MNT_DIR_SYNC, exist_ok=True) try: yield TEST_SCHAINS_MNT_DIR_SYNC