-
Notifications
You must be signed in to change notification settings - Fork 17
[Build] Improve link checker #9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
15 commits
Select commit
Hold shift + click to select a range
175b23c
first draft of markdown_link_check.py
hughperkins 5287674
recurses hierarchy
hughperkins 3368b88
migrate to deepseek written link checker
hughperkins 9a94026
yamlm...
hughperkins f7e0f5a
works on single file too
hughperkins b60e0a1
print available anchors
hughperkins 8842260
remove symbols
hughperkins 9436345
create unit tests
hughperkins 270c55e
fix all links...
hughperkins 790b095
move test_markdown_link_check into test/python
hughperkins f807ff2
update link to point to global_settings.md
hughperkins 6e18a40
Merge remote-tracking branch 'genesis/main' into hp/improve-link-checker
hughperkins 997dfa9
remove dummy pic
hughperkins 752ffc2
remove dummy pic
hughperkins 2026e8a
going high precision
hughperkins File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,152 @@ | ||
| import re | ||
|
duburcqa marked this conversation as resolved.
|
||
| import os | ||
| import pathlib | ||
| from urllib.parse import urlparse | ||
| import argparse | ||
|
|
||
# Module-level flag: set to True by check_markdown_links() and check_anchor()
# whenever they report a broken link or anchor; the script entry point reads
# it after all files are processed to choose the exit status.
error_found: bool = False  # Track if any errors are found
|
|
||
def _link_target(raw_target):
    """Return the destination part of a raw ``(...)`` link body.

    Strips surrounding whitespace, an optional quoted link title and optional
    angle brackets around the destination.
    """
    target = raw_target.strip()
    # CommonMark allows a quoted title after the destination:
    # [text](path "title"). Only strip it when it is unambiguously a title so
    # that unescaped paths containing spaces keep resolving as before.
    with_title = re.match(r'^(<[^>]*>|\S+)\s+(?:"[^"]*"|\'[^\']*\')$', target)
    if with_title:
        target = with_title.group(1)
    # Destinations may be wrapped in angle brackets: [text](<my file.md>).
    if len(target) >= 2 and target[0] == '<' and target[-1] == '>':
        target = target[1:-1]
    return target


def check_markdown_links(file_path, base_dir=None):
    """
    Check all links in a Markdown file, including anchor references.

    Every ``[text](target)`` and ``![alt](target)`` occurrence is inspected:

    * ``http``, ``https`` and ``mailto`` targets are listed but not fetched.
    * ``#anchor`` targets are checked against the headers of ``file_path``.
    * Relative paths are resolved against ``base_dir`` and must exist; a
      fragment pointing into a ``.md`` file is checked with ``check_anchor``.
    * Targets with any other scheme or a network location are ignored.

    Problems are printed to stdout and recorded by setting the module-level
    ``error_found`` flag to ``True``. Nothing is returned.

    Args:
        file_path: Path to the Markdown file
        base_dir: Base directory for relative links (defaults to file's directory)
    """
    global error_found
    # Imported locally so this function does not rely on a module-level import
    # that the rest of the file does not already have.
    from urllib.parse import unquote

    if base_dir is None:
        base_dir = os.path.dirname(os.path.abspath(file_path))

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # One pattern covers both links and images: the leading "!" is optional.
    raw_targets = re.findall(r'!?\[.*?\]\((.*?)\)', content)
    links = [target for target in map(_link_target, raw_targets) if target]

    for link in links:
        parsed = urlparse(link)

        # Skip mailto and external links
        if parsed.scheme in ('http', 'https', 'mailto'):
            print(f"[-] External link (not checked): {link}")
            continue

        # Handle anchor-only links
        if not parsed.path and parsed.fragment:
            check_anchor(file_path, parsed.fragment)
            continue

        # Anything else that is not a plain relative path is left unchecked.
        if parsed.scheme or parsed.netloc:
            continue

        full_path = os.path.normpath(os.path.join(base_dir, parsed.path))

        if not os.path.exists(full_path):
            # The literal path is tried first so files whose names really
            # contain "%" keep working; fall back to the percent-decoded form
            # (e.g. "my%20file.md" -> "my file.md").
            decoded_path = os.path.normpath(os.path.join(base_dir, unquote(parsed.path)))
            if not os.path.exists(decoded_path):
                print(f"❌ Broken link: {link} (File not found: {full_path})")
                error_found = True
                continue
            full_path = decoded_path

        # Check anchor in local file
        if parsed.fragment:
            # Case-insensitive, matching how find_markdown_files selects files.
            if full_path.lower().endswith('.md'):
                check_anchor(full_path, parsed.fragment)
            else:
                # For non-markdown files, we can't check anchors
                print(f"⚠️ Anchor in non-Markdown file (not checked): {link}")
|
|
||
def _clean_anchor(text):
    """Slugify *text*: lowercase, spaces to hyphens, drop every character
    other than ``a-z``, ``0-9`` and ``-``, collapse hyphen runs and trim
    leading/trailing hyphens."""
    slug = text.lower().replace(' ', '-')
    slug = re.sub(r'[^a-z0-9\-]', '', slug)
    slug = re.sub(r'-+', '-', slug)
    return slug.strip('-')


def check_anchor(md_file_path, anchor):
    """
    Check if an anchor exists in a Markdown file.

    The anchor and every ATX header (``# ...``) of the file are slugified the
    same way and compared. A header matches either through its hyphenated
    slug (``My Header`` -> ``my-header``) or through its compact form with
    the spaces dropped (``myheader``), which is also what an
    underscore-separated anchor such as ``my_header`` normalizes to.

    When no header matches, the broken anchor and the list of available
    slugs are printed and the module-level ``error_found`` flag is set. If
    the file cannot be read, a warning is printed and the flag is left
    untouched. Nothing is returned.

    Args:
        md_file_path: Path to the Markdown file
        anchor: Anchor to check (without #)
    """
    global error_found
    # Only the file read can fail here, so only it is guarded.
    try:
        with open(md_file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"⚠️ Error checking anchor #{anchor} in {md_file_path}: {str(e)}")
        return

    normalized_anchor = _clean_anchor(anchor)

    # Pattern for Markdown headers
    header_pattern = r'^#+\s+(.*)$'

    available_anchors = []
    for line in content.split('\n'):
        match = re.match(header_pattern, line)
        if not match:
            continue
        header_text = match.group(1)
        anchor_dash = _clean_anchor(header_text)
        # Lowercase BEFORE filtering: filtering the raw text would silently
        # drop uppercase letters ("My Header" -> "yeader") and accept anchors
        # that do not correspond to the header at all.
        anchor_compact = re.sub(r'[^a-z0-9\-]', '', header_text.lower())
        if normalized_anchor in (anchor_dash, anchor_compact):
            return
        available_anchors.append(anchor_dash)

    print(f"❌ Broken anchor: #{anchor} in {md_file_path}")
    print(" Available anchors in this file:")
    for a in available_anchors:
        print(f" - {a}")
    error_found = True
|
|
||
def find_markdown_files(root_dir):
    """
    Recursively find all .md files under root_dir.

    The extension test is case-insensitive. Paths are built by joining each
    directory reported by ``os.walk`` with the file name, and the result is
    sorted so that the order of the report does not depend on the order in
    which the filesystem happens to list entries.

    Args:
        root_dir: Directory to search.

    Returns:
        Sorted list of paths to the Markdown files found (empty if none).
    """
    md_files = []
    for dirpath, _, filenames in os.walk(root_dir):
        md_files.extend(
            os.path.join(dirpath, filename)
            for filename in filenames
            if filename.lower().endswith('.md')
        )
    md_files.sort()
    return md_files
|
|
||
if __name__ == '__main__':
    # Command-line entry point: check one Markdown file, or every Markdown
    # file below a directory.
    # Exit status: 0 = no problems (or no Markdown files found),
    #              1 = the given path is neither a directory nor a .md file,
    #              2 = at least one broken link or anchor was reported.
    #
    # sys.exit is used instead of the bare exit() helper: exit() is injected
    # by the site module for interactive sessions and is not guaranteed to
    # exist (e.g. under "python -S").
    import sys

    parser = argparse.ArgumentParser(description="Check Markdown links in a directory or a single Markdown file.")
    parser.add_argument("path", help="Path to the root directory or a Markdown file")
    args = parser.parse_args()

    input_path = os.path.abspath(args.path)

    if os.path.isdir(input_path):
        md_files = find_markdown_files(input_path)
        if not md_files:
            print(f"No Markdown files found in {input_path}")
            sys.exit(0)
    elif os.path.isfile(input_path) and input_path.lower().endswith('.md'):
        md_files = [input_path]
    else:
        print(f"Error: {input_path} is not a directory or a Markdown (.md) file.")
        sys.exit(1)

    for md_file in md_files:
        print(f"\nChecking: {md_file}")
        # Relative links are resolved against the directory of each file.
        check_markdown_links(md_file, base_dir=os.path.dirname(md_file))
    if error_found:
        sys.exit(2)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,92 @@ | ||
| import tempfile | ||
|
duburcqa marked this conversation as resolved.
|
||
| import os | ||
| import pytest | ||
| from tools.markdown_link_check import check_markdown_links, check_anchor, find_markdown_files | ||
|
|
||
@pytest.fixture
def temp_dir():
    """Yield the path of a fresh temporary directory, removed after the test."""
    scratch = tempfile.TemporaryDirectory()
    try:
        yield scratch.name
    finally:
        scratch.cleanup()
|
|
||
def write_md(base_dir, filename, content):
    """Write ``content`` to ``base_dir/filename`` as UTF-8 and return the path.

    Missing parent directories of the target file are created first.
    """
    target = os.path.join(base_dir, filename)
    parent = os.path.dirname(target)
    os.makedirs(parent, exist_ok=True)
    with open(target, "w", encoding="utf-8") as handle:
        handle.write(content)
    return target
|
|
||
def test_find_markdown_files(temp_dir):
    """Only ``.md`` files are returned, including those in subdirectories."""
    write_md(temp_dir, "a.md", "# Title")
    write_md(temp_dir, "b.txt", "not markdown")
    os.mkdir(os.path.join(temp_dir, "sub"))
    write_md(temp_dir, "sub/c.md", "# Sub")

    found = find_markdown_files(temp_dir)

    assert len(found) == 2
    for expected_name in ("a.md", "c.md"):
        assert any(path.endswith(expected_name) for path in found)
|
|
||
def test_check_markdown_links_valid(temp_dir, capsys):
    """A relative link to an existing sibling file produces no error output."""
    write_md(temp_dir, "main.md", "# Title\n[Link](other.md)\n")
    write_md(temp_dir, "other.md", "# Other")

    main_path = os.path.join(temp_dir, "main.md")
    check_markdown_links(main_path, temp_dir)

    captured = capsys.readouterr()
    assert "❌" not in captured.out
|
|
||
def test_check_markdown_links_broken_file(temp_dir, capsys):
    """A link to a file that does not exist is reported as a broken link."""
    write_md(temp_dir, "main.md", "# Title\n[Missing](missing.md)\n")

    main_path = os.path.join(temp_dir, "main.md")
    check_markdown_links(main_path, temp_dir)

    captured = capsys.readouterr()
    assert "❌ Broken link" in captured.out
|
|
||
def test_check_anchor_found(temp_dir, capsys):
    """The hyphenated slug of an existing header is accepted."""
    doc_path = write_md(temp_dir, "doc.md", "# My Header\n")

    check_anchor(doc_path, "my-header")

    captured = capsys.readouterr()
    assert "❌" not in captured.out
|
|
||
def test_check_anchor_not_found(temp_dir, capsys):
    """An anchor that matches no header is reported as broken."""
    doc_path = write_md(temp_dir, "doc.md", "# My Header\n")

    check_anchor(doc_path, "not-present")

    captured = capsys.readouterr()
    assert "❌ Broken anchor" in captured.out
|
|
||
def test_check_anchor_symbol_removal(temp_dir, capsys):
    """Backticks and punctuation in a header are ignored when matching."""
    doc_path = write_md(temp_dir, "doc.md", "# My `Header`.\n")

    check_anchor(doc_path, "my-header")

    captured = capsys.readouterr()
    assert "❌" not in captured.out
|
|
||
def test_external_and_mailto_links(temp_dir, capsys):
    """``https`` and ``mailto`` targets are listed as external links."""
    source = "# Title\n[Google](https://google.com)\n[Email](mailto:test@example.com)\n"
    main_path = write_md(temp_dir, "main.md", source)

    check_markdown_links(main_path, temp_dir)

    captured = capsys.readouterr()
    assert "External link" in captured.out
|
|
||
def test_anchor_only_link(temp_dir, capsys):
    """A ``#anchor`` link is resolved against the headers of the same file."""
    main_path = write_md(temp_dir, "main.md", "# Section 1\n[Go](#section-1)\n")

    check_markdown_links(main_path, temp_dir)

    captured = capsys.readouterr()
    assert "❌" not in captured.out
|
|
||
|
|
||
def test_pr_review(temp_dir, capsys):
    """Check that a header containing ``&`` is matched by an in-file anchor
    link whose slug has a single hyphen where the ``&`` was."""
    md = """

- [PR review & merging checklist](#pr-review-merging-checklist)

### PR review & merging checklist

Follow this checklist during PR review or merging:
"""
    path = write_md(temp_dir, "main.md", md)
    check_markdown_links(path, temp_dir)
    out = capsys.readouterr().out
    # No broken-link or broken-anchor marker may appear in the output.
    assert "❌" not in out
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[Going High Precision](...)