diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..ec57aae3 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "tools/docusaurus-link-checker"] + path = tools/docusaurus-link-checker + url = https://github.com/ethersphere/docusaurus-link-checker.git diff --git a/README.md b/README.md index 98d82f7a..f5f2815b 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,45 @@ The script is **informational only** (exit 0) — it won't block the build. A few pages are intentionally excluded (intro/landing pages that only contain navigation cards). Their warnings are expected and can be ignored. +## Link Checker + +This repo includes [ethersphere/docusaurus-link-checker](https://github.com/ethersphere/docusaurus-link-checker) as a git submodule at `tools/docusaurus-link-checker`. After cloning, initialise it with: + +```bash +git submodule update --init +``` + +### Usage + +Run the checker from the repo root: + +```bash +npm run check:links +``` + +You will be prompted to choose local or live mode. Flags are passed through after `--`: + +```bash +npm run check:links -- --mode local +npm run check:links -- --mode live --site-domain docs.ethswarm.org +npm run check:links -- --mode local --no-external --threads 16 +``` + +| Flag | Description | +|---|---| +| `--mode local\|live` | Local build check (default) or live site crawl | +| `--site-domain` | Your site's domain — auto-detected from `docusaurus.config.*` if omitted | +| `--no-external` | Skip external URL checking (local mode only) | +| `--threads N` | Number of concurrent HTTP threads (default: 8) | + +To run the full build and then immediately check links: + +```bash +npm run build:check +``` + +Reports are written to `link-reports/` (gitignored). + ## Bumping Version Don't forget to find and replace the version number for the whole of the docs folder. 
diff --git a/package.json b/package.json index 230e80b8..837f4743 100644 --- a/package.json +++ b/package.json @@ -11,7 +11,9 @@ "swizzle": "docusaurus swizzle", "deploy": "docusaurus deploy", "serve": "docusaurus serve", - "check:links": "python scripts/check_links.py" + "precheck:links": "git submodule update --remote tools/docusaurus-link-checker", + "check:links": "python tools/docusaurus-link-checker/check_links.py", + "build:check": "npm run build && python tools/docusaurus-link-checker/check_links.py" }, "dependencies": { "@docsearch/js": "^4.3.2", diff --git a/scripts/check_links.py b/scripts/check_links.py deleted file mode 100644 index 00d6a7d4..00000000 --- a/scripts/check_links.py +++ /dev/null @@ -1,1444 +0,0 @@ -#!/usr/bin/env python3 -""" -Link checker for bee-docs Docusaurus site. -Checks: - 1. Internal links in source docs (markdown/mdx) — verified against the BUILD output: - • page existence checked by looking up the corresponding HTML file in build/ - • anchor existence checked by reading actual id attributes from rendered HTML - No slug inference — only what Docusaurus actually produced is trusted. - 2. Internal links in build output (HTML) — file existence + anchors - 3. External links in source docs — real HTTP requests (HEAD/GET) - -Requirements: run 'npm run build' before running this script so build/ is current. 
- -Usage: - python scripts/check_links.py [--no-external] [--threads N] - npm run check:links -""" - -import os -import re -import sys -import time -import threading -import queue -import subprocess -from pathlib import Path - -# On Windows, npm is a .cmd script and must be invoked as 'npm.cmd' -_NPM = 'npm.cmd' if sys.platform == 'win32' else 'npm' -from html.parser import HTMLParser -from urllib.parse import urlparse, unquote -from collections import defaultdict -from urllib.request import Request, urlopen, HTTPRedirectHandler, build_opener -from urllib.error import URLError, HTTPError -import http.client -import socket -import argparse - -# ───────────────────────────────────────────── -# Configuration -# ───────────────────────────────────────────── - -PROJECT_DIR = Path(__file__).resolve().parent.parent -DOCS_DIR = PROJECT_DIR / "docs" -BUILD_DIR = PROJECT_DIR / "build" -STATIC_DIR = PROJECT_DIR / "static" -REPORT_PATH = PROJECT_DIR / "link-reports/dead_links_report.md" -HUMAN_REPORT_PATH = PROJECT_DIR / "link-reports/dead_links_audit.md" - -# The live domain — full-URL links using this domain are treated as internal -# and checked against the local build directory instead of via HTTP. 
-SITE_DOMAIN = "docs.ethswarm.org" -SITE_BASE_URL = f"https://{SITE_DOMAIN}" - -# External link checker settings -EXT_TIMEOUT = 15 # seconds per request -EXT_THREADS = 8 # concurrent HTTP workers -EXT_DELAY = 0.15 # seconds between requests per thread (politeness) - -USER_AGENT = ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/122.0 Safari/537.36 bee-docs-link-checker/2.0" ) - -# Schemes to collect for external checking (everything http/https) -EXTERNAL_SCHEMES = ("http://", "https://") -# Schemes to ignore entirely -IGNORE_SCHEMES = ("mailto:", "javascript:", "tel:", "ftp:", "data:") - -# Internal path prefixes where anchor checking is skipped (JS-rendered pages) -SKIP_ANCHOR_PATHS = ( - "/api/", - "/api#", -) - -# Hostnames/prefixes to skip — example/placeholder URLs in documentation -IGNORE_HOSTS = ( - "localhost", - "127.0.0.1", - "192.168.", - "10.0.", - "0.0.0.0", -) - - -# ───────────────────────────────────────────── -# Helpers — markdown link extraction -# ───────────────────────────────────────────── - -def strip_code_blocks(content): - content = re.sub(r'<!--[\s\S]*?-->', '', content) # HTML comments - content = re.sub(r'```[^\n]*\n[\s\S]*?```', '', content) - content = re.sub(r'~~~[^\n]*\n[\s\S]*?~~~', '', content) - content = re.sub(r'`[^`\n]+`', '', content) - return content - - -def extract_md_links(content): - """Return list of (link_text, url) from markdown content.""" - content = strip_code_blocks(content) - links = [] - - # Inline links: [text](url) or [text](url "title") - # The URL pattern allows balanced parentheses (e.g. 
Wikipedia URLs like /wiki/APT_(software)) - for m in re.finditer(r'\[([^\]]*)\]\(((?:[^)(]|\([^)]*\))*?)(?:\s+"[^"]*")?\)', content): - url = m.group(2).strip().split('"')[0].strip().split("'")[0].strip() - links.append((m.group(1), url)) - - # Reference-style definitions - ref_defs = {} - for m in re.finditer(r'^\[([^\]]+)\]:\s*(\S+)', content, re.MULTILINE): - ref_defs[m.group(1).lower()] = m.group(2) - - # Reference-style uses - for m in re.finditer(r'\[([^\]]+)\]\[([^\]]*)\]', content): - text = m.group(1) - ref = m.group(2).lower() if m.group(2) else text.lower() - if ref in ref_defs: - links.append((text, ref_defs[ref])) - - # HTML anchors and images in markdown - for m in re.finditer(r'<a [^>]*href=["\']([^"\']+)["\']', content, re.IGNORECASE): - links.append(('', m.group(1))) - for m in re.finditer(r'<img[^>]*src=["\']([^"\']+)["\']', content, re.IGNORECASE): - links.append(('', m.group(1))) - - # Bare URLs — plain http(s) URLs not inside a markdown link or HTML attribute. - # Collect all URL positions already captured above to avoid double-reporting. 
- seen_spans = set() - for m in re.finditer(r'\[([^\]]*)\]\(([^)]+)\)', content): - seen_spans.add(m.start(2)) - for m in re.finditer(r'^\[([^\]]+)\]:\s*(\S+)', content, re.MULTILINE): - seen_spans.add(m.start(2)) - for m in re.finditer(r'(?:href|src)=["\']([^"\']+)["\']', content, re.IGNORECASE): - seen_spans.add(m.start(1)) - - for m in re.finditer(r'https?://[^\s\]>"\'\\<*`]+', content): - if m.start() not in seen_spans: - url = m.group(0).rstrip('.,;:!') - # Strip trailing unbalanced close-parens - while url.endswith(')') and url.count('(') < url.count(')'): - url = url[:-1] - links.append(('', url)) - - return links - - -# ───────────────────────────────────────────── -# Helpers — build-output link resolution -# ───────────────────────────────────────────── - -def _frontmatter_id(md_file): - """Return the 'id' value from YAML frontmatter, or None.""" - try: - text = md_file.read_text(encoding='utf-8', errors='replace') - if not text.startswith('---'): - return None - end = text.find('\n---', 3) - if end == -1: - return None - for line in text[3:end].splitlines(): - if line.startswith('id:'): - return line[3:].strip().strip('"\'') - except Exception: - pass - return None - - -def _build_docid_map(): - """ - Scan all HTML files in the build and return a dict {doc_id: html_path}. - - Docusaurus embeds the doc ID in the class as 'docs-doc-id-{id}', - e.g. class="... docs-doc-id-concepts/DISC/disc ...". - This is the ground truth for what page is at what path — no inference needed. 
- """ - mapping = {} - if not BUILD_DIR.exists(): - return mapping - for html_file in BUILD_DIR.rglob('index.html'): - try: - # Only read the opening tag (first ~500 bytes) for performance - with html_file.open(encoding='utf-8', errors='replace') as fh: - head = fh.read(800) - m = re.search(r'docs-doc-id-([^\s"\']+)', head) - if m: - mapping[m.group(1)] = html_file - except Exception: - pass - return mapping - - -# Populated once at first call to md_path_to_build_html() -_DOCID_MAP = None - - -def md_path_to_build_html(md_file): - """Map a source .md/.mdx file to the HTML file Docusaurus built from it. - - Uses the build's own HTML files (via the embedded docs-doc-id class) as the - authoritative source — no path inference or slug computation. - - Falls back to a computed path when the build map lookup misses. - """ - global _DOCID_MAP - if _DOCID_MAP is None: - _DOCID_MAP = _build_docid_map() - - try: - rel = md_file.relative_to(DOCS_DIR) - except ValueError: - return None - - # Compute the full doc ID: parent/local_id - local_id = _frontmatter_id(md_file) or rel.with_suffix('').name - parent = str(rel.parent).replace('\\', '/') - doc_id = local_id if parent == '.' else f"{parent}/{local_id}" - - # Look up in the reverse map first (authoritative) - if doc_id in _DOCID_MAP: - return _DOCID_MAP[doc_id] - - # Fallback: compute expected path - parent_path = rel.parent - if local_id == 'index': - return BUILD_DIR / 'docs' / parent_path / 'index.html' - return BUILD_DIR / 'docs' / parent_path / local_id / 'index.html' - - -def resolve_internal_to_build_html(source_md, link_path): - """Resolve an internal (non-http) link path to the build HTML file it corresponds to. - - Checks the build/ directory only — no slug inference, no source-file guessing. - Returns (html_path_or_None, error_reason_or_None). - Caller is responsible for splitting off any '#anchor' before calling. 
- """ - decoded = unquote(link_path) - - # ── Absolute path (/docs/… or /static/…) ── - if decoded.startswith('/'): - rel = decoded.lstrip('/') - candidates = [ - BUILD_DIR / rel, - BUILD_DIR / rel / 'index.html', - BUILD_DIR / (rel + '.html'), - ] - for c in candidates: - if c.exists() and c.is_file(): - return c, None - return None, f"Not found in build: /{rel}" - - # ── Relative path ── - target = (source_md.parent / decoded).resolve() - - # Non-markdown file (image, PDF, asset): check static/ and on-disk path - if target.suffix not in ('', '.md', '.mdx'): - if target.exists(): - return target, None - try: - static_candidate = STATIC_DIR / target.relative_to(PROJECT_DIR) - if static_candidate.exists(): - return static_candidate, None - except ValueError: - pass - return None, f"File not found: {target.name}" - - # Markdown / no extension: find source file → map to build HTML - md_candidates = ( - [target] if target.suffix in ('.md', '.mdx') - else [target.with_suffix('.md'), target.with_suffix('.mdx'), - target / 'index.md', target / 'index.mdx'] - ) - for md_cand in md_candidates: - if md_cand.exists() and md_cand.is_file(): - build_html = md_path_to_build_html(md_cand) - if build_html is None: - return None, "Could not map source file to build path" - if build_html.exists(): - return build_html, None - return None, "Source file exists but its build HTML was not found — is the build current?" 
- - return None, "Source file not found" - - -def resolve_site_url_locally(url): - """Check a full docs.ethswarm.org URL against the local build output.""" - parsed = urlparse(url) - rel = parsed.path.rstrip('/').lstrip('/') - candidates = [ - BUILD_DIR / rel, - BUILD_DIR / rel / 'index.html', - BUILD_DIR / (rel + '.html'), - ] - for c in candidates: - if c.exists() and c.is_file(): - return True, str(c) - return False, str(BUILD_DIR / rel) - - -# ───────────────────────────────────────────── -# External URL checker -# ───────────────────────────────────────────── - -EXT_STATUS_OK = 'ok' -EXT_STATUS_404 = '404' -EXT_STATUS_DOWN = 'down' -EXT_STATUS_REDIRECT = 'redirect' -EXT_STATUS_ERROR = 'error' -EXT_STATUS_INTERNAL = 'internal_404' # full site URL that resolves locally but build says 404 - - -class _NoFollowRedirectHandler(HTTPRedirectHandler): - """Prevent urllib from automatically following redirects.""" - def redirect_request(self, req, fp, code, msg, headers, newurl): - return None # returning None makes urllib raise HTTPError with the 3xx code - - -def _build_no_redirect_opener(): - return build_opener(_NoFollowRedirectHandler()) - - -def _fetch(url, headers, method='HEAD', follow_redirects=False): - """ - Make a single HTTP request. - - follow_redirects=False: do not follow redirects; 3xx responses return the - code and Location header so the caller can decide what to do. - follow_redirects=True: follow the full redirect chain (standard urlopen behaviour). - - Returns (status_code_or_None, final_url, location_header_or_None, error_str_or_None). 
- """ - try: - req = Request(url, headers=headers, method=method) - if follow_redirects: - with urlopen(req, timeout=EXT_TIMEOUT) as resp: - return resp.status, resp.url, None, None - else: - opener = _build_no_redirect_opener() - with opener.open(req, timeout=EXT_TIMEOUT) as resp: - return resp.status, url, resp.headers.get('Location'), None - except HTTPError as e: - loc = e.headers.get('Location') if hasattr(e, 'headers') and e.headers else None - return e.code, url, loc, None - except (URLError, socket.timeout, socket.error, ConnectionRefusedError, - http.client.RemoteDisconnected, http.client.IncompleteRead) as e: - return None, url, None, str(e) - except Exception as e: - return None, url, None, f'{type(e).__name__}: {str(e)[:80]}' - - -def _classify_connection_error(result, err): - """Populate result with the right status for a network-level error string.""" - if 'ECONNREFUSED' in err or 'Connection refused' in err: - result['status'] = EXT_STATUS_DOWN - result['error_msg'] = 'ECONNREFUSED — server down' - elif ('Name or service not known' in err or 'getaddrinfo' in err - or 'nodename' in err.lower() or 'No address' in err): - result['status'] = EXT_STATUS_DOWN - result['error_msg'] = 'DNS resolution failed' - elif 'timed out' in err.lower() or 'timeout' in err.lower(): - result['status'] = EXT_STATUS_DOWN - result['error_msg'] = 'Connection timed out' - elif 'SSL' in err or 'ssl' in err: - result['status'] = EXT_STATUS_DOWN - result['error_msg'] = f'SSL error: {err[:80]}' - else: - result['status'] = EXT_STATUS_DOWN - result['error_msg'] = f'Connection error: {err[:80]}' - return result - - -def _check_destination(dest_url, headers): - """ - Verify that a redirect destination is actually reachable (200). - Follows the full redirect chain from dest_url. - Returns (status_code_or_None, final_url, error_str_or_None). 
- """ - code, final, _, err = _fetch(dest_url, headers, method='HEAD', follow_redirects=True) - if err: - return None, dest_url, err - if code in (403, 405): - # Some servers reject HEAD — retry with GET - code, final, _, err = _fetch(dest_url, headers, method='GET', follow_redirects=True) - if err: - return None, dest_url, err - return code, final or dest_url, None - - -def check_external_url(url): - """ - Check a single external URL. - - Strategy: - 1. HEAD request WITHOUT following redirects so we can see whether - the URL itself redirects (and where). - 2. If 3xx: explicitly fetch the redirect destination and verify it - returns 200. Only report as EXT_STATUS_REDIRECT if the destination - is reachable. A redirect that leads to a 404/down is reported as - the appropriate broken status. - 3. If HEAD is rejected (403/405): retry with GET, same logic. - - Returns dict: {url, status, http_code, final_url, error_msg} - """ - result = { - 'url': url, - 'status': EXT_STATUS_ERROR, - 'http_code': None, - 'final_url': None, - 'error_msg': None, - } - - # Special case: links to our own live site — check against local build - parsed = urlparse(url) - if parsed.netloc == SITE_DOMAIN: - exists, tried = resolve_site_url_locally(url) - if exists: - result['status'] = EXT_STATUS_OK - else: - result['status'] = EXT_STATUS_INTERNAL - result['error_msg'] = f"Not in local build: {tried}" - return result - - headers = { - 'User-Agent': USER_AGENT, - 'Accept': 'text/html,application/xhtml+xml,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.9', - } - - # ── Step 1: initial request (no auto-redirect) ── - code, _, location, err = _fetch(url, headers, method='HEAD', follow_redirects=False) - - if err: - return _classify_connection_error(result, err) - - # HEAD rejected → retry with GET (no auto-redirect) - if code in (403, 405): - code, _, location, err = _fetch(url, headers, method='GET', follow_redirects=False) - if err: - return _classify_connection_error(result, err) - if code in 
(403, 405): - result['status'] = EXT_STATUS_ERROR - result['http_code'] = code - result['error_msg'] = f"HTTP {code} (GET retry)" - result['final_url'] = url - return result - - result['http_code'] = code - - # ── Step 2: classify based on response code ── - if code is None: - result['status'] = EXT_STATUS_ERROR - return result - - if code == 200: - result['status'] = EXT_STATUS_OK - result['final_url'] = url - - elif code == 404: - result['status'] = EXT_STATUS_404 - result['error_msg'] = 'HTTP 404' - result['final_url'] = url - - elif code in (301, 302, 303, 307, 308): - # ── Redirect: verify the destination is actually reachable ── - dest = location or url - # Make dest absolute if it's a relative Location header - if dest and not dest.startswith('http'): - p = urlparse(url) - dest = f"{p.scheme}://{p.netloc}{dest}" - - dest_code, dest_final, dest_err = _check_destination(dest, headers) - - if dest_err: - result['status'] = EXT_STATUS_DOWN - result['error_msg'] = f"Redirect to {dest!r} failed: {dest_err[:80]}" - result['final_url'] = dest - elif dest_code is None: - result['status'] = EXT_STATUS_DOWN - result['error_msg'] = f"Redirect destination unreachable: {dest!r}" - result['final_url'] = dest - elif dest_code == 200: - if _urls_differ_meaningfully(url, dest_final): - result['status'] = EXT_STATUS_REDIRECT - result['final_url'] = dest_final - else: - result['status'] = EXT_STATUS_OK - result['final_url'] = dest_final - elif dest_code == 404: - result['status'] = EXT_STATUS_404 - result['error_msg'] = f"Redirect target returned 404 ({dest!r})" - result['final_url'] = dest - else: - result['status'] = EXT_STATUS_ERROR - result['error_msg'] = f"Redirect target returned HTTP {dest_code}" - result['final_url'] = dest - - else: - # Any other 2xx is fine; other codes treated as errors - if 200 <= code < 300: - result['status'] = EXT_STATUS_OK - result['final_url'] = url - else: - result['status'] = EXT_STATUS_ERROR - result['error_msg'] = f"HTTP {code}" - 
result['final_url'] = url - - return result - - -def _urls_differ_meaningfully(original, final): - """True if the URLs differ in a way that's worth reporting (not just http→https or trailing slash).""" - if not final or original == final: - return False - o = urlparse(original) - f = urlparse(final) - o_path = o.path.rstrip('/') - f_path = f.path.rstrip('/') - # Same host+path, only scheme or trailing-slash differs → not meaningful - if o.netloc == f.netloc and o_path == f_path and o.query == f.query: - return False - # http → https upgrade on same host/path → not meaningful - if (o.netloc == f.netloc and o_path == f_path - and o.scheme == 'http' and f.scheme == 'https'): - return False - return True - - -def check_external_urls_threaded(url_to_sources, threads=EXT_THREADS): - """ - Check a dict of {url: [source_files]} concurrently. - Returns dict of {url: check_result_dict}. - """ - urls = list(url_to_sources.keys()) - results = {} - lock = threading.Lock() - q = queue.Queue() - - for url in urls: - q.put(url) - - total = len(urls) - done = [0] - - def worker(): - while True: - try: - url = q.get_nowait() - except queue.Empty: - break - time.sleep(EXT_DELAY) - res = check_external_url(url) - with lock: - results[url] = res - done[0] += 1 - n = done[0] - if n % 10 == 0 or n == total: - print(f" External: {n}/{total} checked...", end='\r', flush=True) - q.task_done() - - thread_list = [threading.Thread(target=worker, daemon=True) for _ in range(min(threads, len(urls)))] - for t in thread_list: - t.start() - for t in thread_list: - t.join() - - print() # newline after \r progress - return results - - -# ───────────────────────────────────────────── -# Markdown file checker -# ───────────────────────────────────────────── - -def check_markdown_files(check_external=True): - """ - Scan all .md/.mdx source files. - - Internal links are verified against the BUILD output: - - page existence: does the corresponding build HTML file exist? 
- - anchor existence: is the anchor present as an id attribute in the rendered HTML? - No slug inference is performed at any point. - - Returns: - - broken_internal: list of broken internal link dicts - - external_url_to_sources: dict {url: [(source_file, link_text)]} - - stats - """ - broken_internal = [] - external_url_to_src = defaultdict(list) - files_checked = 0 - links_checked = 0 - html_id_cache = {} # str(html_path) → frozenset of id strings - - if not BUILD_DIR.exists(): - print(" WARNING: build/ directory not found.") - print(" Run 'npm run build' first — internal links cannot be checked without it.") - - md_files = sorted(list(DOCS_DIR.rglob('*.md')) + list(DOCS_DIR.rglob('*.mdx'))) - - for md_file in md_files: - files_checked += 1 - try: - content = md_file.read_text(encoding='utf-8', errors='replace') - except Exception as e: - broken_internal.append({ - 'source': str(md_file), 'link_text': '', 'link_url': '', - 'resolved': '', 'reason': f'Could not read file: {e}', - }) - continue - - # Build HTML for this source file — used for anchor-only (#frag) links - source_build_html = md_path_to_build_html(md_file) - - links = extract_md_links(content) - - for link_text, url in links: - url = url.strip() - if not url or url == '#': - continue - if any(url.startswith(s) for s in IGNORE_SCHEMES): - continue - - parsed_url = urlparse(url) - if any(parsed_url.hostname and parsed_url.hostname.startswith(h) for h in IGNORE_HOSTS): - continue - - links_checked += 1 - - # ── External / self-site links ── - if any(url.startswith(s) for s in EXTERNAL_SCHEMES): - if check_external: - external_url_to_src[url].append((str(md_file), link_text)) - continue - - # ── Split anchor from path ── - anchor = None - link_path = url - if '#' in link_path: - link_path, anchor = link_path.split('#', 1) - - # ── Determine target build HTML ── - if not link_path: - # Anchor-only link — same page - target_html = source_build_html - else: - target_html, reason = 
resolve_internal_to_build_html(md_file, link_path) - if reason or target_html is None or not target_html.exists(): - broken_internal.append({ - 'source': str(md_file), - 'link_text': link_text, - 'link_url': url, - 'resolved': str(target_html) if target_html else link_path, - 'reason': reason or 'Build HTML not found', - }) - continue - - # ── Check anchor in rendered HTML ── - if anchor and any(url.startswith(p) for p in SKIP_ANCHOR_PATHS): - continue # JS-rendered page — anchor not in static HTML - if anchor and target_html and target_html.exists(): - key = str(target_html) - if key not in html_id_cache: - html_id_cache[key] = get_html_ids(target_html) - if anchor not in html_id_cache[key]: - broken_internal.append({ - 'source': str(md_file), - 'link_text': link_text, - 'link_url': url, - 'resolved': f'{target_html}#{anchor}', - 'reason': f'Anchor "#{anchor}" not found in rendered HTML', - }) - - return broken_internal, dict(external_url_to_src), files_checked, links_checked, len(md_files) - - -# ───────────────────────────────────────────── -# HTML build checker -# ───────────────────────────────────────────── - -class LinkExtractor(HTMLParser): - def __init__(self): - super().__init__() - self.links = [] - self.ids = set() - - def handle_starttag(self, tag, attrs): - attrs_dict = dict(attrs) - if 'id' in attrs_dict: - self.ids.add(attrs_dict['id']) - if tag == 'a' and 'href' in attrs_dict: - self.links.append(('href', attrs_dict['href'])) - elif tag in ('img', 'script') and 'src' in attrs_dict: - self.links.append(('src', attrs_dict['src'])) - elif tag == 'link' and 'href' in attrs_dict: - self.links.append(('href', attrs_dict['href'])) - - -def get_html_ids(html_file): - try: - content = html_file.read_text(encoding='utf-8', errors='replace') - except Exception: - return set() - parser = LinkExtractor() - parser.feed(content) - return parser.ids - - -def resolve_html_link(source_html, href, build_root): - anchor = None - if '#' in href: - href, anchor = 
href.split('#', 1) - - href = unquote(href) - if not href: - return None, anchor, None - - if href.startswith('/'): - rel = href.lstrip('/') - target = build_root / rel - candidates = [target] - if target.suffix == '': - candidates.append(target / 'index.html') - else: - source_dir = source_html.parent - target = (source_dir / href).resolve() - candidates = [target] - if target.suffix == '': - candidates.append(target / 'index.html') - - for c in candidates: - if c.exists() and c.is_file(): - return c, anchor, None - return target, anchor, "File not found" - - -def check_html_files(): - broken = [] - files_checked = 0 - links_checked = 0 - id_cache = {} - - html_files = sorted(BUILD_DIR.rglob('*.html')) - - for html_file in html_files: - files_checked += 1 - try: - content = html_file.read_text(encoding='utf-8', errors='replace') - except Exception as e: - broken.append({'source': str(html_file), 'attr': 'href', 'link_url': '', - 'resolved': '', 'reason': f'Could not read: {e}'}) - continue - - parser = LinkExtractor() - parser.feed(content) - file_ids = parser.ids - - for attr, url in parser.links: - url = url.strip() - if not url or url == '#': - continue - if any(url.startswith(s) for s in EXTERNAL_SCHEMES + IGNORE_SCHEMES + ('data:',)): - continue - - links_checked += 1 - - if url.startswith('#'): - anchor = url[1:] - if anchor and anchor not in file_ids: - broken.append({'source': str(html_file), 'attr': attr, 'link_url': url, - 'resolved': f'{html_file}#{anchor}', - 'reason': f'Anchor "#{anchor}" not found in same page'}) - continue - - resolved, anchor, reason = resolve_html_link(html_file, url, BUILD_DIR) - - if reason: - broken.append({'source': str(html_file), 'attr': attr, 'link_url': url, - 'resolved': str(resolved) if resolved else url, 'reason': reason}) - continue - - if anchor and resolved and resolved.exists(): - key = str(resolved) - if key not in id_cache: - id_cache[key] = get_html_ids(resolved) - if anchor not in id_cache[key]: - 
broken.append({'source': str(html_file), 'attr': attr, 'link_url': url, - 'resolved': f'{resolved}#{anchor}', - 'reason': f'Anchor "#{anchor}" not found in target HTML'}) - - return broken, files_checked, links_checked, len(html_files) - - -# ───────────────────────────────────────────── -# Deduplication -# ───────────────────────────────────────────── - -def deduplicate_html_broken(broken): - groups = defaultdict(list) - for item in broken: - groups[(item['link_url'], item['reason'])].append(item) - result = [] - for (url, reason), items in sorted(groups.items()): - rep = dict(items[0]) - rep['count'] = len(items) - rep['example_sources'] = [it['source'] for it in items[:3]] - result.append(rep) - return result - - -# ───────────────────────────────────────────── -# Report -# ───────────────────────────────────────────── - -def make_short_path(path_str, base): - try: - return str(Path(path_str).relative_to(base)) - except ValueError: - try: - return str(Path(path_str).relative_to(PROJECT_DIR)) - except ValueError: - return path_str - - -def write_report( - md_broken, ext_results, ext_url_to_src, - md_files_checked, md_links_checked, md_total_files, - html_broken, html_files_checked, html_links_checked, html_total_files, - staged_replacements=None, -): - import datetime - today = datetime.date.today().isoformat() - - # Categorise external results - ext_404 = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_404} - ext_down = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_DOWN} - ext_redirect = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_REDIRECT} - ext_internal = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_INTERNAL} - ext_error = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_ERROR} - - _staged = staged_replacements or {} - - def _repl(url, res=None): - if url in _staged: - return _staged[url] - final = (res or {}).get('final_url') or '' - return final if final 
and final != url else '' - - deduped_html = deduplicate_html_broken(html_broken) - - lines = [] - lines.append("# Dead Links Report\n") - lines.append(f"Generated: {today}\n") - lines.append("") - - # ── Summary ── - lines.append("## Summary\n") - lines.append("| Category | Count |") - lines.append("|---|---|") - lines.append(f"| Source doc files checked | {md_files_checked} / {md_total_files} |") - lines.append(f"| Internal links checked (source) | {md_links_checked} |") - lines.append(f"| **Broken internal links (source)** | **{len(md_broken)}** |") - lines.append(f"| External URLs checked | {len(ext_results)} |") - lines.append(f"| **External 404s** | **{len(ext_404) + len(ext_internal)}** |") - lines.append(f"| **External down / refused** | **{len(ext_down)}** |") - lines.append(f"| **Stale redirects** | **{len(ext_redirect)}** |") - lines.append(f"| External errors (timeout/misc) | {len(ext_error)} |") - lines.append(f"| Build HTML files checked | {html_files_checked} / {html_total_files} |") - lines.append(f"| **Broken links in build output** | **{len(deduped_html)} patterns** |") - lines.append("") - - # ── Section 1: Internal broken links ── - lines.append("---\n") - lines.append("## Section 1: Broken Internal Links in Source Docs\n") - - if not md_broken: - lines.append("_No broken internal links._\n") - else: - by_file = defaultdict(list) - for item in md_broken: - by_file[item['source']].append(item) - for source in sorted(by_file): - short = make_short_path(source, DOCS_DIR) - lines.append(f"### `{short}`\n") - lines.append("| Link Text | URL | Resolved Path | Reason |") - lines.append("|---|---|---|---|") - for item in by_file[source]: - text = item['link_text'].replace('|', '\\|')[:60] - url = item['link_url'].replace('|', '\\|')[:80] - resolved = make_short_path(item['resolved'], DOCS_DIR).replace('|', '\\|')[:100] - reason = item['reason'].replace('|', '\\|') - lines.append(f"| {text} | `{url}` | `{resolved}` | {reason} |") - lines.append("") - - # 
── Section 2: External 404s ── - lines.append("---\n") - lines.append("## Section 2: External 404s\n") - - all_404 = {**ext_404, **ext_internal} - if not all_404: - lines.append("_No external 404s found._\n") - else: - lines.append("| URL | Notes | Instances (Link Text — File) |") - lines.append("|---|---|---|") - for url, res in sorted(all_404.items()): - instances = _fmt_instances(ext_url_to_src.get(url, [])) - code_str = f"HTTP {res['http_code']}" if res['http_code'] else (res['error_msg'] or '') - if res['status'] == EXT_STATUS_INTERNAL: - code_str = "Not found in local build" - lines.append(f"| `{url[:100]}` | {code_str} | {instances} |") - lines.append("") - - # ── Section 3: Down / refused ── - lines.append("---\n") - lines.append("## Section 3: Down / Connection Refused\n") - - if not ext_down: - lines.append("_No unreachable external links._\n") - else: - lines.append("| URL | Error | Instances (Link Text — File) |") - lines.append("|---|---|---|") - for url, res in sorted(ext_down.items()): - instances = _fmt_instances(ext_url_to_src.get(url, [])) - err = res.get('error_msg', '') or '' - lines.append(f"| `{url[:100]}` | {err} | {instances} |") - lines.append("") - - # ── Section 4: Stale redirects ── - lines.append("---\n") - lines.append("## Section 4: Stale Redirects (Update to Final URL)\n") - - if not ext_redirect: - lines.append("_No stale redirects found._\n") - else: - lines.append("| Original URL | Instances (Link Text — File) |") - lines.append("|---|---|") - for url, res in sorted(ext_redirect.items()): - instances = _fmt_instances(ext_url_to_src.get(url, [])) - lines.append(f"| `{url[:80]}` | {instances} |") - lines.append("") - - # ── Section 5: Errors / timeouts ── - if ext_error: - lines.append("---\n") - lines.append("## Section 5: External Check Errors (timeout / misc)\n") - lines.append("| URL | Error | Instances (Link Text — File) |") - lines.append("|---|---|---|") - for url, res in sorted(ext_error.items()): - instances = 
_fmt_instances(ext_url_to_src.get(url, [])) - err = res.get('error_msg', '') or '' - lines.append(f"| `{url[:100]}` | {err} | {instances} |") - lines.append("") - - # ── Section 6: Build HTML broken links ── - lines.append("---\n") - lines.append("## Section 6: Broken Links in Build Output\n") - lines.append("_Deduplicated by (url, reason) pattern._\n") - - if not deduped_html: - lines.append("_No broken links in build output._\n") - else: - lines.append("| Count | URL | Reason | Example Source |") - lines.append("|---|---|---|---|") - for item in sorted(deduped_html, key=lambda x: -x['count']): - url = item['link_url'].replace('|', '\\|')[:80] - reason = item['reason'].replace('|', '\\|') - example = make_short_path(item['example_sources'][0], BUILD_DIR).replace('|', '\\|')[:80] - lines.append(f"| {item['count']} | `{url}` | {reason} | `{example}` |") - lines.append("") - - REPORT_PATH.parent.mkdir(parents=True, exist_ok=True) - REPORT_PATH.write_text('\n'.join(lines), encoding='utf-8') - print(f"Report written to: {REPORT_PATH}") - - -# ───────────────────────────────────────────── -# Human-readable audit report -# ───────────────────────────────────────────── - -def _source_to_page_link(path_str): - """Return a markdown link like [/docs/foo/bar](https://docs.ethswarm.org/docs/foo/bar).""" - try: - rel = Path(path_str).relative_to(DOCS_DIR) - except ValueError: - return path_str - url_path = str(rel).replace('\\', '/').replace('.mdx', '').replace('.md', '') - display = f"/docs/{url_path}" - url = f"https://{SITE_DOMAIN}/docs/{url_path}" - return f"[{display}]({url})" - - -def _fmt_sources(sources_list, max_show=2): - """Format a list of (file, text) source tuples into page link(s).""" - if not sources_list: - return "Unknown" - seen = [] - for f, _ in sources_list: - lnk = _source_to_page_link(f) - if lnk not in seen: - seen.append(lnk) - if len(seen) > max_show: - return ", ".join(seen[:max_show]) + f" _(+{len(seen)-max_show} more)_" - return ", ".join(seen) - - 
-def _fmt_instances(sources_list, docs_dir=None): - """ - Format a list of (file_path, link_text) tuples as bullet points separated - by
tags (for inline rendering in markdown table cells). - - Each bullet: • "link text" — `relative/file/path.md` - """ - if not sources_list: - return "_unknown_" - if docs_dir is None: - docs_dir = DOCS_DIR - bullets = [] - for f, text in sources_list: - short = make_short_path(f, docs_dir).replace('|', '\\|') - safe_text = (text or '').strip().replace('|', '\\|')[:80] - if safe_text: - bullets.append(f'• "{safe_text}" — `{short}`') - else: - bullets.append(f'• `{short}`') - return "
".join(bullets) - - -def write_human_report( - md_broken, ext_results, ext_url_to_src, - md_files_checked, md_links_checked, md_total_files, - html_broken, html_files_checked, html_links_checked, html_total_files, - staged_replacements=None, -): - import datetime - today = datetime.date.today().isoformat() - - # Categorise external results - ext_404 = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_404} - ext_down = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_DOWN} - ext_redirect = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_REDIRECT} - ext_internal = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_INTERNAL} - ext_error = {u: r for u, r in ext_results.items() if r['status'] == EXT_STATUS_ERROR} - - # Self-site 404s (docs.ethswarm.org old paths) vs truly external 404s - self_404 = {**ext_internal} # checked against local build, not found - real_404 = {**ext_404} # HTTP 404 from external server - - _staged = staged_replacements or {} - - def _repl(url, res=None): - if url in _staged: - return _staged[url] - final = (res or {}).get('final_url') or '' - return final if final and final != url else '' - - n_dead = len(md_broken) + len(self_404) + len(real_404) - n_down = len(ext_down) - n_redirects = len(ext_redirect) - n_errors = len(ext_error) - n_total = n_dead + n_down + n_redirects - - lines = [] - lines.append("## Context\n") - lines.append( - f"Dead link audit of {SITE_DOMAIN} found **{n_total}** broken, down, or stale links. 
" - f"Audit date: {today}.\n" - ) - - # ── Dead Links (404) ────────────────────────────────────────────────────── - lines.append("---\n") - lines.append("## Dead Links (404)\n") - - if not md_broken and not self_404 and not real_404: - lines.append("_No dead links found._\n") - else: - lines.append("| Dead Link | Status | Instances (Link Text — File) |") - lines.append("|---|---|---|") - - # Broken internal links (wrong file path or missing anchor) - for item in md_broken: - url = item['link_url'].replace('|', '\\|') - reason = item['reason'].replace('|', '\\|') - instances = _fmt_instances([(item['source'], item.get('link_text', ''))]) - lines.append(f"| `{url}` | **Broken** — {reason} | {instances} |") - - # Self-site 404s (old docs.ethswarm.org paths not in local build) - for url, _res in sorted(self_404.items()): - instances = _fmt_instances(ext_url_to_src.get(url, [])) - lines.append(f"| {url} | **404** — not found in local build (old path?) | {instances} |") - - # External 404s - for url, res in sorted(real_404.items()): - instances = _fmt_instances(ext_url_to_src.get(url, [])) - lines.append(f"| {url} | **404** | {instances} |") - - lines.append("") - - # ── Forbidden / Down ───────────────────────────────────────────────────── - lines.append("---\n") - lines.append("## Forbidden / Down\n") - - if not ext_down: - lines.append("_No unreachable links._\n") - else: - lines.append("| Dead Link | Status | Instances (Link Text — File) |") - lines.append("|---|---|---|") - for url, res in sorted(ext_down.items()): - instances = _fmt_instances(ext_url_to_src.get(url, [])) - err = res.get('error_msg') or 'connection failed' - # Simplify error messages - if 'DNS' in err or 'getaddrinfo' in err.lower(): - status = "**DNS failure** — domain not found" - elif 'ECONNREFUSED' in err or 'Connection refused' in err: - status = "**ECONNREFUSED** — server down" - elif 'timed out' in err.lower() or 'timeout' in err.lower(): - status = "**Timeout** — server unresponsive" - elif 
'SSL' in err or 'ssl' in err: - status = "**SSL error** — handshake failure" - else: - status = f"**Down** — {err[:80]}" - lines.append(f"| {url} | {status} | {instances} |") - - lines.append("") - - # ── Stale Redirects ─────────────────────────────────────────────────────── - lines.append("---\n") - lines.append("## Stale Redirects (Should Update)\n") - - if not ext_redirect: - lines.append("_No stale redirects._\n") - else: - lines.append("| Old Link | Instances (Link Text — File) |") - lines.append("|---|---|") - for url, res in sorted(ext_redirect.items()): - instances = _fmt_instances(ext_url_to_src.get(url, [])) - lines.append(f"| {url} | {instances} |") - - lines.append("") - - # ── Errors / Timeouts ──────────────────────────────────────────────────── - if ext_error: - lines.append("---\n") - lines.append("## Check Errors (timeout / blocked)\n") - lines.append("_These URLs could not be verified — check manually._\n") - lines.append("| URL | Error | Instances (Link Text — File) |") - lines.append("|---|---|---|") - for url, res in sorted(ext_error.items()): - instances = _fmt_instances(ext_url_to_src.get(url, [])) - err = res.get('error_msg') or '' - lines.append(f"| {url} | {err} | {instances} |") - lines.append("") - - # ── Summary ─────────────────────────────────────────────────────────────── - lines.append("---\n") - lines.append("## Summary\n") - lines.append(f"- **Broken internal links:** {len(md_broken)}") - lines.append(f"- **Hard 404s (external):** {len(real_404) + len(self_404)}") - lines.append(f"- **Forbidden / Down:** {n_down}") - lines.append(f"- **Stale redirects:** {n_redirects}") - if ext_error: - lines.append(f"- **Check errors (unverified):** {n_errors}") - lines.append(f"- **Total actionable:** {n_total}") - lines.append("") - - # ── Priority ───────────────────────────────────────────────────────────── - lines.append("---\n") - lines.append("## Priority\n") - priority = [] - if md_broken: - priority.append(f"1. 
Fix {len(md_broken)} broken internal links (wrong paths / missing anchors)") - if self_404: - priority.append(f"{len(priority)+1}. Update {len(self_404)} old self-referential `{SITE_DOMAIN}` path(s) to current URLs") - if real_404: - priority.append(f"{len(priority)+1}. Remove or replace {len(real_404)} dead external link(s) (HTTP 404)") - if ext_down: - priority.append(f"{len(priority)+1}. Evaluate {len(ext_down)} down/refused server link(s) — remove or replace") - if ext_redirect: - priority.append(f"{len(priority)+1}. Update {len(ext_redirect)} stale redirect(s) to their final URL") - if ext_error: - priority.append(f"{len(priority)+1}. Manually verify {len(ext_error)} URL(s) that returned errors during check") - for item in priority: - lines.append(item) - lines.append("") - - HUMAN_REPORT_PATH.parent.mkdir(parents=True, exist_ok=True) - HUMAN_REPORT_PATH.write_text('\n'.join(lines), encoding='utf-8') - print(f"Human report written to: {HUMAN_REPORT_PATH}") - - -# ───────────────────────────────────────────── -# Staged-changes URL replacement map -# ───────────────────────────────────────────── - -def get_staged_url_replacements(): - """ - Parse `git diff --cached` to find URL replacements in staged changes. - Within each diff hunk, URLs on removed lines (-) are matched to URLs on - added lines (+) in order. Returns {old_url: new_url}. 
- """ - url_re = re.compile(r'https?://[^\s\])"\'<>`\\]+') - try: - result = subprocess.run( - ['git', 'diff', '--cached', '--unified=0'], - cwd=str(PROJECT_DIR), - capture_output=True, text=True, - ) - if result.returncode != 0 or not result.stdout: - return {} - except Exception: - return {} - - replacements = {} - removed, added = [], [] - - def _flush(): - removed_set = set(removed) - added_set = set(added) - gone = [u for u in removed if u not in added_set] - new = [u for u in added if u not in removed_set] - if gone and new: - for old, new_url in zip(gone, new): - replacements[old] = new_url - - for line in result.stdout.splitlines(): - if line.startswith(('diff --git', 'index ', '--- ', '+++ ')): - continue - if line.startswith('@@'): - _flush() - removed, added = [], [] - elif line.startswith('-'): - removed.extend(url_re.findall(line[1:])) - elif line.startswith('+'): - added.extend(url_re.findall(line[1:])) - _flush() - return replacements - - -# ───────────────────────────────────────────── -# Build helper -# ───────────────────────────────────────────── - -def _build_is_outdated(): - """ - Return True if any source file in docs/, static/, or key config files - was modified more recently than the build directory itself. - """ - try: - build_mtime = BUILD_DIR.stat().st_mtime - except FileNotFoundError: - return True # no build at all - - watch_dirs = [DOCS_DIR, STATIC_DIR] - watch_files = [ - PROJECT_DIR / "docusaurus.config.mjs", - PROJECT_DIR / "sidebars.js", - ] - - for d in watch_dirs: - if d.exists(): - for f in d.rglob("*"): - if f.is_file() and f.stat().st_mtime > build_mtime: - return True - - for f in watch_files: - if f.exists() and f.stat().st_mtime > build_mtime: - return True - - return False - - -def trigger_build(): - """ - Ensure a current build exists before running local checks. - - Behaviour: - • No build found → build immediately, no prompt needed. - • Build found, up to date → ask permission to overwrite. 
- • Build found, outdated → warn user, ask if they want to rebuild. - - Returns True if the build is ready to use, False on build failure or abort. - """ - if not BUILD_DIR.exists(): - print("\nNo existing build found — running: npm run build") - print("-" * 40) - result = subprocess.run([_NPM, 'run', 'build'], cwd=str(PROJECT_DIR)) - print("-" * 40) - if result.returncode != 0: - print("ERROR: Build failed (see output above).", file=sys.stderr) - return False - print("Build complete.\n") - return True - - # Build exists — check freshness - outdated = _build_is_outdated() - if outdated: - print(f"\nWARNING: The existing build at {BUILD_DIR} is outdated") - print(" (source files have changed since it was last built).") - prompt = "Rebuild now to get accurate results? This will overwrite it. [Y/n]: " - else: - print(f"\nAn existing build was found at: {BUILD_DIR} (appears up to date).") - prompt = "Rebuild now anyway? This will overwrite it. [y/N]: " - - try: - resp = input(prompt).strip().lower() - except (EOFError, KeyboardInterrupt): - print() - resp = '' - - # For outdated builds default is YES; for current builds default is NO - if outdated: - do_build = resp not in ('n', 'no') - else: - do_build = resp in ('y', 'yes') - - if not do_build: - if outdated: - print("Skipping rebuild — results may not reflect latest changes.\n") - else: - print("Skipping rebuild — using existing build.\n") - return True - - print("\nRunning: npm run build") - print("-" * 40) - result = subprocess.run([_NPM, 'run', 'build'], cwd=str(PROJECT_DIR)) - print("-" * 40) - if result.returncode != 0: - print("ERROR: Build failed (see output above).", file=sys.stderr) - return False - print("Build complete.\n") - return True - - -# ───────────────────────────────────────────── -# Main -# ───────────────────────────────────────────── - -def main(): - parser = argparse.ArgumentParser( - description='Bee-docs link checker', - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=( - 
"Modes:\n" - " local — build the site locally and check source docs + build output\n" - " live — fetch the live site at docs.ethswarm.org and check all links\n" - ), - ) - parser.add_argument( - '--mode', choices=['local', 'live'], default=None, - help='Check mode: "local" (build + source check) or "live" (live site crawl). ' - 'If omitted you will be prompted.', - ) - parser.add_argument('--no-external', action='store_true', - help='(local mode only) Skip external URL checking') - parser.add_argument('--threads', type=int, default=EXT_THREADS, - help=f'Concurrent HTTP threads (default: {EXT_THREADS})') - args = parser.parse_args() - - # ── Mode selection ── - mode = args.mode - if mode is None: - print("=== Bee-docs Link Checker ===\n") - print("Which site do you want to check?") - print(" 1) local — build locally and check source docs + build output") - print(" 2) live — fetch the live site at docs.ethswarm.org\n") - try: - choice = input("Enter 1 or 2 [default: 1]: ").strip() - except (EOFError, KeyboardInterrupt): - print() - choice = '1' - mode = 'live' if choice == '2' else 'local' - print() - - # ── Live mode: delegate to check_live_links.py ── - if mode == 'live': - live_script = Path(__file__).parent / 'check_live_links.py' - if not live_script.exists(): - print(f"ERROR: {live_script} not found.", file=sys.stderr) - sys.exit(1) - cmd = [sys.executable, str(live_script), '--threads', str(args.threads)] - print(f"Running live checker: {' '.join(cmd)}\n") - result = subprocess.run(cmd) - sys.exit(result.returncode) - - # ── Local mode ── - check_ext = not args.no_external - - print("=== Bee-docs Link Checker — Local Mode ===") - print(f"Docs dir : {DOCS_DIR}") - print(f"Build dir : {BUILD_DIR}") - print(f"External : {'enabled' if check_ext else 'disabled (--no-external)'}") - print() - - if not DOCS_DIR.exists(): - print(f"ERROR: Docs dir not found: {DOCS_DIR}") - sys.exit(1) - - # Always trigger a build for local mode - if not trigger_build(): - sys.exit(1) 
- - print("Scanning source docs (internal links)...") - md_broken, ext_url_to_src, md_files, md_links, md_total = check_markdown_files(check_ext) - print(f" Files: {md_files}/{md_total}, Links: {md_links}, Broken internal: {len(md_broken)}") - print(f" Unique external URLs collected: {len(ext_url_to_src)}") - - ext_results = {} - if check_ext and ext_url_to_src: - print(f"\nChecking {len(ext_url_to_src)} external URLs ({args.threads} threads)...") - ext_results = check_external_urls_threaded(ext_url_to_src, threads=args.threads) - ok = sum(1 for r in ext_results.values() if r['status'] == EXT_STATUS_OK) - redirects = sum(1 for r in ext_results.values() if r['status'] == EXT_STATUS_REDIRECT) - not_found = sum(1 for r in ext_results.values() if r['status'] in (EXT_STATUS_404, EXT_STATUS_INTERNAL)) - down = sum(1 for r in ext_results.values() if r['status'] == EXT_STATUS_DOWN) - errors = sum(1 for r in ext_results.values() if r['status'] == EXT_STATUS_ERROR) - print(f" OK: {ok} Redirect: {redirects} 404: {not_found} Down: {down} Error: {errors}") - - html_broken = [] - html_files = html_links = html_total = 0 - if BUILD_DIR.exists(): - print("\nChecking build output (HTML internal links)...") - html_broken, html_files, html_links, html_total = check_html_files() - print(f" Files: {html_files}/{html_total}, Links: {html_links}, Broken: {len(html_broken)}") - - staged = get_staged_url_replacements() - if staged: - print(f"\nFound {len(staged)} staged URL replacement(s) from git diff.") - - print("\nWriting report...") - write_report( - md_broken, ext_results, ext_url_to_src, - md_files, md_links, md_total, - html_broken, html_files, html_links, html_total, - staged_replacements=staged, - ) - write_human_report( - md_broken, ext_results, ext_url_to_src, - md_files, md_links, md_total, - html_broken, html_files, html_links, html_total, - staged_replacements=staged, - ) - - -if __name__ == '__main__': - main() diff --git a/scripts/check_live_links.py 
b/scripts/check_live_links.py deleted file mode 100644 index 6b975ac9..00000000 --- a/scripts/check_live_links.py +++ /dev/null @@ -1,686 +0,0 @@ -#!/usr/bin/env python3 -""" -Live site link checker for docs.ethswarm.org. - -Fetches all pages listed in the sitemap, extracts every link, -then checks each link with explicit redirect handling (no auto-following). - -Usage: - python scripts/check_live_links.py [--threads N] [--max-pages N] - npm run check:links (then select live mode) - -Output: - .claude/live_links_audit.md — human-readable report -""" - -import re -import sys -import time -import queue -import socket -import threading -import http.client -import subprocess -import xml.etree.ElementTree as ET -import argparse -import datetime -from html.parser import HTMLParser -from pathlib import Path -from urllib.parse import urlparse, urljoin, unquote -from urllib.request import Request, urlopen, HTTPRedirectHandler, build_opener -from urllib.error import URLError, HTTPError -from collections import defaultdict - -# ───────────────────────────────────────────── -# Configuration -# ───────────────────────────────────────────── - -SITE_BASE = "https://docs.ethswarm.org" -SITEMAP_URL = f"{SITE_BASE}/sitemap.xml" -PROJECT_DIR = Path(__file__).resolve().parent.parent -REPORT_PATH = PROJECT_DIR / "link-reports/live_links_audit.md" - -EXT_TIMEOUT = 15 # seconds per HTTP request -EXT_THREADS = 8 # concurrent URL checkers -EXT_DELAY = 0.05 # seconds between requests per thread - -USER_AGENT = ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/122.0 Safari/537.36 bee-docs-live-checker/1.0" -) - -IGNORE_SCHEMES = ("mailto:", "javascript:", "tel:", "ftp:", "data:", "#") -IGNORE_HOSTS = ("localhost", "127.0.0.1", "192.168.", "10.0.", "0.0.0.0") -# Hostnames that end with these suffixes are placeholder/example URLs in docs -IGNORE_HOST_SUFFIXES = (".example", ".local", ".invalid", ".test") - -# URL substrings to silently 
ignore — systematic redirects that aren't actionable doc fixes. -# e.g. every page has an "Edit this page" link using the old GitHub repo name. -IGNORE_URL_PATTERNS = ( - "github.com/ethersphere/docs.github.io", # "Edit this page" links using old repo name -) - -# Hostnames to ignore because they are example/template values in documentation -IGNORE_EXAMPLE_HOSTS = ( - "yourname.eth.limo", - "yourname.bzz.link", - "bee-1", # example service hostname in docker/gateway examples -) - -EXT_STATUS_OK = 'ok' -EXT_STATUS_404 = '404' -EXT_STATUS_DOWN = 'down' -EXT_STATUS_REDIRECT = 'redirect' -EXT_STATUS_ERROR = 'error' - - -# ───────────────────────────────────────────── -# HTTP helpers (explicit redirect handling) -# ───────────────────────────────────────────── - -class _NoFollowRedirectHandler(HTTPRedirectHandler): - def redirect_request(self, req, fp, code, msg, headers, newurl): - return None - - -def _build_no_redirect_opener(): - return build_opener(_NoFollowRedirectHandler()) - - -def _fetch(url, method='HEAD', follow_redirects=False, timeout=EXT_TIMEOUT): - """ - Single HTTP request. - Returns (status_code, final_url, location_header, error_str). 
- """ - headers = { - 'User-Agent': USER_AGENT, - 'Accept': 'text/html,application/xhtml+xml,*/*;q=0.8', - } - try: - req = Request(url, headers=headers, method=method) - if follow_redirects: - with urlopen(req, timeout=timeout) as resp: - return resp.status, resp.url, None, None - else: - opener = _build_no_redirect_opener() - with opener.open(req, timeout=timeout) as resp: - return resp.status, url, resp.headers.get('Location'), None - except HTTPError as e: - loc = e.headers.get('Location') if hasattr(e, 'headers') and e.headers else None - return e.code, url, loc, None - except (URLError, socket.timeout, socket.error, ConnectionRefusedError, - http.client.RemoteDisconnected, http.client.IncompleteRead) as e: - return None, url, None, str(e) - except Exception as e: - return None, url, None, f'{type(e).__name__}: {str(e)[:120]}' - - -def _classify_err(result, err): - if 'ECONNREFUSED' in err or 'Connection refused' in err: - result.update(status=EXT_STATUS_DOWN, error_msg='ECONNREFUSED — server down') - elif ('Name or service not known' in err or 'getaddrinfo' in err - or 'nodename' in err.lower() or 'No address' in err): - result.update(status=EXT_STATUS_DOWN, error_msg='DNS resolution failed') - elif 'timed out' in err.lower() or 'timeout' in err.lower(): - result.update(status=EXT_STATUS_DOWN, error_msg='Connection timed out') - elif 'SSL' in err or 'ssl' in err: - result.update(status=EXT_STATUS_DOWN, error_msg=f'SSL error: {err[:80]}') - else: - result.update(status=EXT_STATUS_DOWN, error_msg=f'Connection error: {err[:80]}') - return result - - -def _urls_differ(original, final): - if not final or original == final: - return False - o, f = urlparse(original), urlparse(final) - op, fp = o.path.rstrip('/'), f.path.rstrip('/') - if o.netloc == f.netloc and op == fp and o.query == f.query: - return False - if o.netloc == f.netloc and op == fp and o.scheme == 'http' and f.scheme == 'https': - return False - return True - - -def _check_dest(dest_url): - """Follow 
redirect destination and verify it returns 200.""" - code, final, _, err = _fetch(dest_url, method='HEAD', follow_redirects=True) - if err: - return None, dest_url, err - if code in (403, 405): - code, final, _, err = _fetch(dest_url, method='GET', follow_redirects=True) - if err: - return None, dest_url, err - return code, final or dest_url, None - - -def check_url(url): - """ - Check a single URL with explicit redirect handling. - Returns dict: {url, status, http_code, final_url, error_msg} - """ - result = dict(url=url, status=EXT_STATUS_ERROR, - http_code=None, final_url=None, error_msg=None) - - # Step 1: HEAD without following redirects - code, _, location, err = _fetch(url, method='HEAD', follow_redirects=False) - if err: - return _classify_err(result, err) - - # HEAD rejected → retry with GET - if code in (403, 405): - code, _, location, err = _fetch(url, method='GET', follow_redirects=False) - if err: - return _classify_err(result, err) - if code in (403, 405): - result.update(status=EXT_STATUS_ERROR, http_code=code, - error_msg=f'HTTP {code} (GET retry)', final_url=url) - return result - - result['http_code'] = code - - if code is None: - result['status'] = EXT_STATUS_ERROR - return result - if code == 200: - result.update(status=EXT_STATUS_OK, final_url=url) - elif code == 404: - result.update(status=EXT_STATUS_404, error_msg='HTTP 404', final_url=url) - elif code in (301, 302, 303, 307, 308): - dest = location or url - if dest and not dest.startswith('http'): - p = urlparse(url) - dest = f"{p.scheme}://{p.netloc}{dest}" - dest_code, dest_final, dest_err = _check_dest(dest) - if dest_err: - result.update(status=EXT_STATUS_DOWN, - error_msg=f"Redirect to {dest!r} failed: {dest_err[:80]}", - final_url=dest) - elif dest_code is None: - result.update(status=EXT_STATUS_DOWN, - error_msg=f"Redirect destination unreachable", - final_url=dest) - elif dest_code == 200: - if _urls_differ(url, dest_final): - result.update(status=EXT_STATUS_REDIRECT, 
final_url=dest_final) - else: - result.update(status=EXT_STATUS_OK, final_url=dest_final) - elif dest_code == 404: - result.update(status=EXT_STATUS_404, - error_msg=f"Redirect target returned 404", - final_url=dest) - else: - result.update(status=EXT_STATUS_ERROR, - error_msg=f"Redirect target returned HTTP {dest_code}", - final_url=dest) - elif 200 <= code < 300: - result.update(status=EXT_STATUS_OK, final_url=url) - else: - result.update(status=EXT_STATUS_ERROR, - error_msg=f'HTTP {code}', final_url=url) - - return result - - -# ───────────────────────────────────────────── -# Sitemap fetcher -# ───────────────────────────────────────────── - -def fetch_sitemap_urls(sitemap_url): - """Fetch sitemap.xml and return list of page URLs.""" - print(f"Fetching sitemap: {sitemap_url}") - try: - req = Request(sitemap_url, headers={'User-Agent': USER_AGENT}) - with urlopen(req, timeout=30) as resp: - xml_data = resp.read() - except Exception as e: - print(f"ERROR fetching sitemap: {e}", file=sys.stderr) - return [] - - urls = [] - try: - root = ET.fromstring(xml_data) - # Handle namespace - ns = '' - if root.tag.startswith('{'): - ns = root.tag.split('}')[0] + '}' - for loc in root.iter(f'{ns}loc'): - u = loc.text.strip() if loc.text else '' - if u: - urls.append(u) - except ET.ParseError as e: - print(f"ERROR parsing sitemap XML: {e}", file=sys.stderr) - - print(f" Found {len(urls)} URLs in sitemap") - return urls - - -# ───────────────────────────────────────────── -# HTML link extractor -# ───────────────────────────────────────────── - -class LinkExtractor(HTMLParser): - def __init__(self): - super().__init__() - self.links_with_text = [] # list of (href, link_text) from tags - self.text_chunks = [] # all visible text (including code blocks) for bare URL extraction - self._skip_depth = 0 # depth inside