From 1b3ecbc99a10d47ab0ba773a8c5610f10a99018a Mon Sep 17 00:00:00 2001 From: kant Date: Mon, 18 Aug 2025 12:44:08 -0700 Subject: [PATCH] extracting world state in pages --- tools/state-dump/README.md | 16 +- tools/state-dump/world_state.py | 336 ++++++++++++++++++++++---------- 2 files changed, 247 insertions(+), 105 deletions(-) diff --git a/tools/state-dump/README.md b/tools/state-dump/README.md index 509bd6c77..e9ecdc51b 100644 --- a/tools/state-dump/README.md +++ b/tools/state-dump/README.md @@ -38,16 +38,26 @@ pip install -r requirements.txt python3 world_state.py \ --rpc http://:8545 \ --input-genesis initial_genesis.json \ - --output out_genesis.json + --output out_genesis.json \ + --include-code \ + --include-storage \ + --page-size 2048 \ + --min-page-size 256 \ + --rpc-timeout 300 ``` **Example:** ```bash -python3 world_state.py \ +python3 world_state3.py \ --rpc http://34.75.194.46:8545 \ --input-genesis initial_genesis.json \ - --output out_genesis.json + --output out_genesis.json \ + --include-code \ + --include-storage \ + --page-size 2048 \ + --min-page-size 256 \ + --rpc-timeout 300 ``` - `--rpc` diff --git a/tools/state-dump/world_state.py b/tools/state-dump/world_state.py index 5c5d5a728..2ff5219f2 100644 --- a/tools/state-dump/world_state.py +++ b/tools/state-dump/world_state.py @@ -1,131 +1,263 @@ #!/usr/bin/env python3 -import json +""" +World-state → genesis.alloc via Erigon-style debug_accountRange (6-arg, 'next'), +pinned to a single block number to avoid 'missing trie node ... loc: diff' errors. + +- Prints and pins the exact block number (hex) for ALL calls. +- Lightweight account paging (no inline code/storage). +- Optional per-contract code and full storage paging. +- Adaptive page sizes for both accounts and storage. 
+""" + import argparse -import requests +import json +import time from pathlib import Path +from typing import Dict, Any, Optional -# Default filename for the migrated genesis output -DEFAULT_OUT_NAME = "out_genesis.json" +import requests -def rpc_call(rpc_url: str, method: str, params: list) -> dict: - payload = { - "jsonrpc": "2.0", - "id": 1, - "method": method, - "params": params - } - resp = requests.post(rpc_url, json=payload) - resp.raise_for_status() - data = resp.json() - if "error" in data: - raise RuntimeError(f"RPC error ({method}): {data['error']}") - return data["result"] +DEFAULT_OUT = "out_genesis.json" -def rpc_get_block_number(rpc_url: str) -> int: - """Fetch the latest block number (as an int).""" - hex_bn = rpc_call(rpc_url, "eth_blockNumber", []) - return int(hex_bn, 16) +class RpcError(RuntimeError): + pass -def rpc_debug_dump_block(rpc_url: str, block_param: str) -> dict: - """Call debug_dumpBlock at the given block tag or hex number.""" - return rpc_call(rpc_url, "debug_dumpBlock", [block_param]) +def rpc_call(url: str, method: str, params: list, *, timeout: float, retries: int = 0, backoff: float = 1.6) -> Any: + last = None + for i in range(retries + 1): + try: + r = requests.post(url, json={"jsonrpc": "2.0", "id": 1, "method": method, "params": params}, timeout=timeout) + r.raise_for_status() + data = r.json() + if "error" in data: + raise RpcError(f"{method}: {data['error']}") + return data["result"] + except (requests.Timeout, requests.ConnectionError, RpcError) as e: + last = e + if i < retries: + time.sleep(backoff ** i) + else: + raise + raise last # pragma: no cover -def to_hex(x: str) -> str: - """Convert a decimal string to a hex string (0x-prefixed).""" +def to_hex_any(x) -> str: + if x is None: + return "0x0" + if isinstance(x, int): + return hex(x) + if isinstance(x, str): + s = x.strip() + if s.startswith("0x") or s.startswith("0X"): + return s + return hex(int(s)) return hex(int(x)) -def build_alloc(accounts: dict, 
def get_latest_block(url: str, timeout: float) -> int:
    """Return the chain-head height as an int (via eth_blockNumber)."""
    h = rpc_call(url, "eth_blockNumber", [], timeout=timeout)
    return int(h, 16)


def account_range(url: str, block_hex: str, start_token: str, n: int, *, timeout: float) -> Dict[str, Any]:
    """Fetch one lightweight page of debug_accountRange pinned to block_hex.

    Uses the Erigon 6-argument form with nocode=True, nostorage=True,
    incompletes=True, so the page carries balances/nonces only; code and
    storage are fetched per contract later.
    """
    return rpc_call(url, "debug_accountRange", [block_hex, start_token, n, True, True, True], timeout=timeout)


def storage_range_at(url: str, block_hex: str, addr: str, start_key: str, n: int, *, timeout: float) -> Dict[str, Any]:
    """Fetch one page of debug_storageRangeAt for addr at the pinned block.

    txIndex=None selects the post-state of the block (per the original
    author's note; Erigon accepts a block-number hex here — verify against
    the node's RPC docs if targeting another client).
    """
    return rpc_call(url, "debug_storageRangeAt", [block_hex, None, addr, start_key, n], timeout=timeout)


def eth_get_code(url: str, addr: str, block_hex: str, *, timeout: float) -> str:
    """Return the bytecode of addr at the same pinned block ("0x" for EOAs)."""
    return rpc_call(url, "eth_getCode", [addr, block_hex], timeout=timeout)


def page_full_storage(
    url: str,
    block_hex: str,
    addr: str,
    *,
    timeout: float,
    initial_page_size: int = 2048,
    min_page_size: int = 128,
) -> Dict[str, str]:
    """Page the complete storage of one contract at the pinned block.

    Halves the page size (floored at min_page_size) whenever the node
    reports a server-side timeout; zero/empty slot values are skipped.
    Any other RpcError — including "missing trie node", which usually
    means the node lacks state at this block — propagates to the caller.
    """
    storage: Dict[str, str] = {}
    next_key = "0x"
    page = initial_page_size
    while True:
        try:
            res = storage_range_at(url, block_hex, addr, next_key, page, timeout=timeout)
        except RpcError as e:
            # Adaptive shrink on server-reported timeouts only; note a
            # client-side requests.Timeout is raised by rpc_call directly
            # and does not reach this handler.
            if "timed out" in str(e).lower() and page > min_page_size:
                page = max(min_page_size, page // 2)
                print(f"⚠️ storageRangeAt timed out for {addr}; reducing storage page-size to {page} and retrying…")
                continue
            # "missing trie node" and everything else propagate unchanged.
            raise

        for key, slot in (res.get("storage") or {}).items():
            value = slot.get("value", "0x0")
            if value not in (None, "0x", "0x0", "0", 0):
                storage[key] = value

        next_key = res.get("nextKey")
        if not next_key:
            return storage


def build_alloc(
    url: str,
    block_hex: str,
    *,
    initial_page_size: int,
    min_page_size: int,
    include_nonces: bool,
    include_code: bool,
    include_storage: bool,
    timeout: float,
    storage_page_initial: int,
    storage_page_min: int,
) -> dict:
    """Walk every account at the pinned block and build a genesis 'alloc' map.

    Pages accounts via debug_accountRange (adaptive page size on
    server-side timeouts); optionally attaches per-contract code and
    fully-paged storage. A "missing trie node" failure while paging one
    contract's storage is reported and skipped so the remaining accounts
    still make it into the alloc.
    """
    alloc: dict = {}
    start = "0x"
    total = 0
    page_size = initial_page_size

    while True:
        # Retry the current page with a smaller size on reported timeouts.
        while True:
            try:
                res = account_range(url, block_hex, start, page_size, timeout=timeout)
                break
            except RpcError as e:
                if "timed out" in str(e).lower() and page_size > min_page_size:
                    page_size = max(min_page_size, page_size // 2)
                    print(f"⚠️ accountRange timed out; reducing account page-size to {page_size} and retrying…")
                    continue
                raise

        for addr, meta in (res.get("accounts") or {}).items():
            entry = {"balance": to_hex_any(meta.get("balance", "0"))}
            if include_nonces and meta.get("nonce") is not None:
                entry["nonce"] = to_hex_any(meta["nonce"])

            # Per-contract code / storage enrichment.
            if include_code or include_storage:
                code = eth_get_code(url, addr, block_hex, timeout=timeout)
                if code and code != "0x":
                    if include_code:
                        entry["code"] = code
                    if include_storage:
                        try:
                            storage = page_full_storage(
                                url,
                                block_hex,
                                addr,
                                timeout=timeout,
                                initial_page_size=storage_page_initial,
                                min_page_size=storage_page_min,
                            )
                            if storage:
                                entry["storage"] = storage
                        except RpcError as e:
                            if "missing trie node" in str(e).lower():
                                # Keep balance/nonce/code; only storage is lost.
                                print(f"❌ storageRangeAt failed for {addr} due to missing trie node at pinned block. "
                                      f"Your node likely lacks full state for storage paging (needs archive-like state).")
                                print(" → Options: (1) re-run without --include-storage, "
                                      "or (2) use an archive/fully-synced node with state available at this block.")
                            else:
                                raise

            alloc[addr] = entry
            total += 1

        start = res.get("next")
        if not start:
            break

    print(f" ↳ paged: {total} accounts")
    return alloc


def main():
    """CLI entry: scan world-state at a pinned block, merge into a genesis template."""
    ap = argparse.ArgumentParser(description="World-state → genesis.alloc (Erigon accountRange) pinned to a block, with adaptive paging.")
    ap.add_argument("--rpc", required=True)
    ap.add_argument("--input-genesis", required=True)
    ap.add_argument("--output", "-o", help=f"Output path or directory (default ./{DEFAULT_OUT})")
    ap.add_argument("--block", "-b", type=int, help="Block number (default: latest)")
    ap.add_argument("--exclude-nonces", action="store_true")
    ap.add_argument("--page-size", type=int, default=2048)
    ap.add_argument("--min-page-size", type=int, default=256)
    ap.add_argument("--include-code", action="store_true")
    ap.add_argument("--include-storage", action="store_true")  # implies include-code if contract present
    ap.add_argument("--rpc-timeout", type=float, default=300.0)
    ap.add_argument("--storage-page-size", type=int, default=2048, help="Initial storage page size")
    ap.add_argument("--storage-min-page-size", type=int, default=128, help="Minimum storage page size")
    args = ap.parse_args()

    include_nonces = not args.exclude_nonces
    # --include-storage implies fetching code so contracts are detectable.
    include_code = bool(args.include_code or args.include_storage)
    include_storage = bool(args.include_storage)

    # Load the genesis template to merge into.
    tpl_path = Path(args.input_genesis)
    if not tpl_path.is_file():
        raise SystemExit(f"❌ Input genesis not found: {tpl_path}")
    genesis_tpl = json.loads(tpl_path.read_text())

    # Resolve and PIN the block: every subsequent call uses this hex number,
    # avoiding 'missing trie node' errors from state moving under us.
    bn = args.block if args.block is not None else get_latest_block(args.rpc, args.rpc_timeout)
    block_hex = hex(bn)

    print(f"⛓ Scanning state pinned at block {bn} (tag {block_hex}, account page {args.page_size})…")
    if include_storage:
        print(" • Including contract storage (per-account paging). This may take a while.")

    alloc = build_alloc(
        args.rpc,
        block_hex,
        initial_page_size=args.page_size,
        min_page_size=args.min_page_size,
        include_nonces=include_nonces,
        include_code=include_code,
        include_storage=include_storage,
        timeout=args.rpc_timeout,
        storage_page_initial=args.storage_page_size,
        storage_page_min=args.storage_min_page_size,
    )

    # Merge scanned alloc over the template's existing alloc and write out.
    new_gen = dict(genesis_tpl)
    merged = dict(new_gen.get("alloc") or {})
    merged.update(alloc)
    new_gen["alloc"] = merged

    out_path = Path(args.output) if args.output else Path.cwd() / DEFAULT_OUT
    if out_path.is_dir():
        out_path = out_path / DEFAULT_OUT
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(new_gen, indent=2))
    print(f"✅ Wrote merged genesis → {out_path}")


if __name__ == "__main__":
    main()