# check_markdown_links.py
#!/usr/bin/env /usr/local/miniconda3/bin/python
from __future__ import annotations

import argparse
import fnmatch
import re
import sys
from pathlib import Path
from urllib.parse import unquote

# Matches inline Markdown links and images: an optional leading '!', the
# bracketed label (group 1) and the parenthesised target (group 2). The target
# may not contain ')', so a trailing link title is captured together with it.
LINK_RE = re.compile(r'!?(?:\[([^\]]*)\])\(([^)]+)\)')
# Lower-case target prefixes that never name a file relative to the document:
# external URL schemes and same-page anchors.
SKIP_PREFIXES = ('http://', 'https://', 'mailto:', 'tel:', '#')
# Exclusion patterns (relative to the scan root) applied on every run.
DEFAULT_EXCLUDES = ['CHANGELOG.md']


def should_check(target: str) -> bool:
    """Return whether *target* should be checked as a relative file path.

    Args:
        target: Raw link target as captured from the Markdown source.

    Returns:
        ``False`` for empty targets and for targets that begin with one of
        ``SKIP_PREFIXES``; ``True`` otherwise.
    """
    candidate = target.strip()
    # The destination may be written in the `<...>` form; only the text after
    # the opening bracket decides whether this is an external URL or anchor.
    if candidate.startswith('<'):
        candidate = candidate[1:].lstrip()
    # URL schemes are case-insensitive, so compare against a lower-cased copy.
    return bool(candidate) and not candidate.lower().startswith(SKIP_PREFIXES)


# Optional link title that may follow the destination inside the parentheses,
# e.g. ` "Title"` or ` 'Title'`, anchored at the end of the target.
_LINK_TITLE_RE = re.compile(r'''\s+(?:"[^"]*"|'[^']*')\s*$''')


def normalize_target(raw: str) -> str:
    """Reduce a raw Markdown link target to the file-system path it names.

    Args:
        raw: Link target exactly as written between the parentheses.

    Returns:
        The path portion of the target with any link title, ``<...>``
        wrapper, ``#fragment`` and ``?query`` removed and percent-escapes
        decoded. The result is empty when the target is only a fragment.
    """
    # Drop the title first: it may itself contain '#' or '?', which would
    # otherwise cut the target at the wrong place.
    target = _LINK_TITLE_RE.sub('', raw.strip())
    if target.startswith('<') and target.endswith('>'):
        target = target[1:-1]
    target = target.split('#', 1)[0].split('?', 1)[0].strip()
    # Decode only after splitting so an escaped '%23' or '%3F' stays part of
    # the file name instead of being treated as a delimiter.
    return unquote(target)


def iter_markdown_files(root: Path, excludes: list[str]) -> list[Path]:
    """Collect the Markdown files under *root* that are not excluded.

    Args:
        root: Directory that is searched recursively for ``*.md`` entries.
        excludes: ``fnmatch`` patterns matched against each entry's POSIX-style
            path relative to *root*.

    Returns:
        Matching files in sorted order.
    """

    def is_excluded(path: Path) -> bool:
        rel = path.relative_to(root).as_posix()
        return any(fnmatch.fnmatch(rel, pattern) for pattern in excludes)

    # rglob matches on name only, so it also yields directories (and dangling
    # symlinks) called "*.md"; those cannot be read as text and are skipped.
    return [
        path
        for path in sorted(root.rglob('*.md'))
        if path.is_file() and not is_excluded(path)
    ]


def scan_markdown_file(path: Path, root: Path) -> list[tuple[str, str]]:
    """Find inline links in one Markdown file whose targets do not exist.

    Args:
        path: Markdown file to read (decoded as UTF-8).
        root: Directory used to express *path* relatively in the results.

    Returns:
        One ``(source, raw_target)`` pair per broken link, where ``source`` is
        *path* relative to *root* in POSIX form and ``raw_target`` is the
        target exactly as written in the file.
    """
    base_dir = path.parent
    broken: list[tuple[str, str]] = []
    for match in LINK_RE.finditer(path.read_text(encoding='utf-8')):
        raw_target = match.group(2)
        # Skipped targets and targets that normalise to nothing (e.g. a bare
        # fragment) both end up as an empty string and are ignored.
        cleaned = normalize_target(raw_target) if should_check(raw_target) else ''
        if cleaned and not (base_dir / cleaned).resolve().exists():
            broken.append((path.relative_to(root).as_posix(), raw_target))
    return broken


def main(argv: list[str] | None = None) -> int:
    """Run the link checker from the command line.

    Args:
        argv: Arguments to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` when every checked target exists, ``1`` when at least one target
        is missing, ``2`` when the root is missing or is not a directory.
    """
    parser = argparse.ArgumentParser(description='Check relative Markdown links for missing files.')
    parser.add_argument('--root', default='docs', help='Root directory containing markdown files')
    parser.add_argument('--exclude', action='append', default=[], help='Glob patterns relative to root to exclude')
    args = parser.parse_args(argv)

    root = Path(args.root).resolve()
    if not root.is_dir():
        # A regular file passes an exists() check but contains nothing to
        # scan, which would otherwise be reported as a successful run.
        reason = 'root is not a directory' if root.exists() else 'root not found'
        print(f'{reason}: {root}', file=sys.stderr)
        return 2

    excludes = DEFAULT_EXCLUDES + list(args.exclude)
    files = iter_markdown_files(root, excludes)
    failures: list[tuple[str, str]] = [
        failure for md in files for failure in scan_markdown_file(md, root)
    ]

    if failures:
        print('Missing relative markdown targets:')
        for source, target in failures:
            print(f'- {source}: {target}')
        return 1

    print(f'OK: checked {len(files)} markdown files under {root} (excluded: {excludes})')
    return 0


if __name__ == '__main__':
    sys.exit(main())