diff --git a/plugins/cpython-review-toolkit/agents/c-complexity-analyzer.md b/plugins/cpython-review-toolkit/agents/c-complexity-analyzer.md index d96461b..c4c0884 100644 --- a/plugins/cpython-review-toolkit/agents/c-complexity-analyzer.md +++ b/plugins/cpython-review-toolkit/agents/c-complexity-analyzer.md @@ -93,3 +93,10 @@ For each function with accidental complexity, provide: - **goto is idiomatic in C error handling**: High goto counts in CPython are normal and not a complexity concern unless the control flow is genuinely hard to follow. - **Context matters**: A complex function that is rarely modified and well-tested is lower priority than a complex function that is frequently changed. - **Suggest concrete refactorings**: Don't just say "this is complex" — show how to simplify it. + +## Running the script + +- Call the script with a Bash timeout of **300000 ms** (5 min). The default 120s kills on large repos. +- Use a **unique temp filename** for the JSON output, e.g. `/tmp/c-complexity-analyzer__$$.json` — the `$$` PID suffix prevents collisions when multiple agents run concurrently. +- Forward `--max-files N` and (where supported) `--workers N` from the caller. +- If the script **times out or errors, do NOT retry it.** Fall back to Grep/Read for the same question. Long-running runs should use `run_in_background`. diff --git a/plugins/cpython-review-toolkit/agents/error-path-analyzer.md b/plugins/cpython-review-toolkit/agents/error-path-analyzer.md index 08195f9..7742a9e 100644 --- a/plugins/cpython-review-toolkit/agents/error-path-analyzer.md +++ b/plugins/cpython-review-toolkit/agents/error-path-analyzer.md @@ -88,3 +88,39 @@ Beyond script findings, look for: - **SystemError is a real bug**: Functions that return NULL without setting an exception cause confusing errors for Python users. - **The canonical pattern is well-established**: CPython functions should use `goto error / goto done` with cleanup labels. Deviations aren't bugs but increase risk. 
- **Some APIs always succeed**: A few C API functions cannot fail (e.g., Py_INCREF on a known non-NULL pointer). Don't flag missing checks for these. + +## Safety Annotations + +`scan_error_paths.py` looks at C comments within +/- 5 lines of each candidate +finding. If any comment contains one of the following keywords (case-insensitive +substring match), the finding is downgraded to `confidence: low` and marked +`suppressed_by_annotation: true`. + +Suppressing keywords: + +- `safety:` / `checked:` — reviewer vouches for the call site +- `safe because` / `correct because` / `this is safe` — justification follows +- `intentional` / `by design` / `deliberately` / `expected` — pattern is chosen +- `not a bug` — known-false-positive marker +- `nolint` — general lint-suppression convention + +Example: +```c +/* checked: PyArg_ParseTuple validated above; err is 0 on this path. */ +return result; +``` + +## Running the script + +- Call the script with a Bash timeout of **300000 ms** (5 min). The default 120s kills on large repos. +- Use a **unique temp filename** for the JSON output, e.g. `/tmp/error-path-analyzer__$$.json` — the `$$` PID suffix prevents collisions when multiple agents run concurrently. +- Forward `--max-files N` and (where supported) `--workers N` from the caller. +- If the script **times out or errors, do NOT retry it.** Fall back to Grep/Read for the same question. Long-running runs should use `run_in_background`. + +## Confidence + +- **HIGH** — structurally identical to a known-bad pattern, or exact signature match; ≥90% likelihood of being a true positive. +- **MEDIUM** — similar with differences that require human verification; 70–89%. +- **LOW** — superficially similar; requires code-context reading; 50–69%. + +Findings below LOW are not reported. 
diff --git a/plugins/cpython-review-toolkit/agents/gil-discipline-checker.md b/plugins/cpython-review-toolkit/agents/gil-discipline-checker.md index dad892f..58e2612 100644 --- a/plugins/cpython-review-toolkit/agents/gil-discipline-checker.md +++ b/plugins/cpython-review-toolkit/agents/gil-discipline-checker.md @@ -84,3 +84,43 @@ For codebases targeting Python 3.13+: - **Blocking with GIL is a judgment call**: Short operations may not be worth the overhead of releasing/reacquiring the GIL. Flag but classify as CONSIDER. - **Free-threading is evolving**: For 3.13+ code, flag patterns that are unsafe under nogil but note that this is forward-looking guidance. - **Some modules are GIL-free by design**: Code in PC/ or Mac/ may use OS-level threading without the GIL. Understand context before flagging. + +## Safety Annotations + +`scan_gil_usage.py` looks at C comments within +/- 5 lines of each candidate +finding. If any comment contains one of the following keywords (case-insensitive +substring match), the finding is downgraded to `confidence: low` and marked +`suppressed_by_annotation: true`. + +Suppressing keywords (GIL-specific terms included): + +- `gil held` / `gil-held` — the GIL is known-held on this path +- `already locked` / `already protected` — mutex/GIL acquired by caller +- `safety:` / `checked:` — reviewer vouches for the call site +- `safe because` / `correct because` / `this is safe` — justification follows +- `intentional` / `by design` / `deliberately` / `expected` — pattern is chosen +- `not a bug` — known-false-positive marker +- `nolint` — general lint-suppression convention + +Example: +```c +Py_BEGIN_ALLOW_THREADS +/* safety: os_level_work() touches no Python objects; releasing the GIL here is intentional. */ +ret = os_level_work(); +Py_END_ALLOW_THREADS +``` + +## Running the script + +- Call the script with a Bash timeout of **300000 ms** (5 min). The default 120s kills on large repos. +- Use a **unique temp filename** for the JSON output, e.g. 
`/tmp/gil-discipline-checker__$$.json` — the `$$` PID suffix prevents collisions when multiple agents run concurrently. +- Forward `--max-files N` and (where supported) `--workers N` from the caller. +- If the script **times out or errors, do NOT retry it.** Fall back to Grep/Read for the same question. Long-running runs should use `run_in_background`. + +## Confidence + +- **HIGH** — structurally identical to a known-bad pattern, or exact signature match; ≥90% likelihood of being a true positive. +- **MEDIUM** — similar with differences that require human verification; 70–89%. +- **LOW** — superficially similar; requires code-context reading; 50–69%. + +Findings below LOW are not reported. diff --git a/plugins/cpython-review-toolkit/agents/git-history-analyzer.md b/plugins/cpython-review-toolkit/agents/git-history-analyzer.md index b5c8efb..f378b77 100644 --- a/plugins/cpython-review-toolkit/agents/git-history-analyzer.md +++ b/plugins/cpython-review-toolkit/agents/git-history-analyzer.md @@ -247,3 +247,11 @@ Final summary: 7. **Cap similar-bug findings at 10.** Cap risk matrix at 10 entries. Note totals if more exist. 8. **Function-level churn uses regex for C files.** The script uses regex-based function boundary detection (consistent with other cpython-review-toolkit scripts). This handles most CPython functions including multi-line signatures and Argument Clinic `_impl` functions, but may miss functions with `#ifdef` brace imbalance. + +## Confidence + +- **HIGH** — structurally identical to a known-bad pattern, or exact signature match; ≥90% likelihood of being a true positive. +- **MEDIUM** — similar with differences that require human verification; 70–89%. +- **LOW** — superficially similar; requires code-context reading; 50–69%. + +Findings below LOW are not reported. 
diff --git a/plugins/cpython-review-toolkit/agents/include-graph-mapper.md b/plugins/cpython-review-toolkit/agents/include-graph-mapper.md index 0e8c9ce..bbfb7cc 100644 --- a/plugins/cpython-review-toolkit/agents/include-graph-mapper.md +++ b/plugins/cpython-review-toolkit/agents/include-graph-mapper.md @@ -106,3 +106,10 @@ From the script's `include_graph` and metrics: - **Internal headers are for internal use**: Code outside Include/internal/ using internal headers is a potential API stability issue. - **Include guards vs. pragma once**: CPython uses traditional include guards. Both are fine — don't flag this as an issue. - **Count accurately**: Report exact numbers from the script, not estimates. + +## Running the script + +- Call the script with a Bash timeout of **300000 ms** (5 min). The default 120s kills on large repos. +- Use a **unique temp filename** for the JSON output, e.g. `/tmp/include-graph-mapper__$$.json` — the `$$` PID suffix prevents collisions when multiple agents run concurrently. +- Forward `--max-files N` and (where supported) `--workers N` from the caller. +- If the script **times out or errors, do NOT retry it.** Fall back to Grep/Read for the same question. Long-running runs should use `run_in_background`. diff --git a/plugins/cpython-review-toolkit/agents/null-safety-scanner.md b/plugins/cpython-review-toolkit/agents/null-safety-scanner.md index efc9551..fff9d05 100644 --- a/plugins/cpython-review-toolkit/agents/null-safety-scanner.md +++ b/plugins/cpython-review-toolkit/agents/null-safety-scanner.md @@ -72,3 +72,39 @@ Beyond script findings: - **Memory allocation can always fail**: Every malloc/PyMem_Malloc call must be checked, even if failure is unlikely. - **PyArg_ParseTuple failure leaves args uninitialized**: If the parse fails and the return isn't checked, extracted arguments are garbage. 
- **Some allocations are "infallible"**: PyMem_Malloc on small sizes almost never fails, but the check is still required per CPython coding standards. + +## Safety Annotations + +`scan_null_checks.py` looks at C comments within +/- 5 lines of each candidate +finding. If any comment contains one of the following keywords (case-insensitive +substring match), the finding is downgraded to `confidence: low` and marked +`suppressed_by_annotation: true`. + +Suppressing keywords: + +- `safety:` / `checked:` — reviewer vouches for the call site +- `safe because` / `correct because` / `this is safe` — justification follows +- `intentional` / `by design` / `deliberately` / `expected` — pattern is chosen +- `not a bug` — known-false-positive marker +- `nolint` — general lint-suppression convention + +Example: +```c +/* safety: ptr guaranteed non-NULL by caller contract. */ +ptr->field = value; +``` + +## Running the script + +- Call the script with a Bash timeout of **300000 ms** (5 min). The default 120s kills on large repos. +- Use a **unique temp filename** for the JSON output, e.g. `/tmp/null-safety-scanner__$$.json` — the `$$` PID suffix prevents collisions when multiple agents run concurrently. +- Forward `--max-files N` and (where supported) `--workers N` from the caller. +- If the script **times out or errors, do NOT retry it.** Fall back to Grep/Read for the same question. Long-running runs should use `run_in_background`. + +## Confidence + +- **HIGH** — structurally identical to a known-bad pattern, or exact signature match; ≥90% likelihood of being a true positive. +- **MEDIUM** — similar with differences that require human verification; 70–89%. +- **LOW** — superficially similar; requires code-context reading; 50–69%. + +Findings below LOW are not reported. 
diff --git a/plugins/cpython-review-toolkit/agents/pep7-style-checker.md b/plugins/cpython-review-toolkit/agents/pep7-style-checker.md index 7f2505f..3cec8d1 100644 --- a/plugins/cpython-review-toolkit/agents/pep7-style-checker.md +++ b/plugins/cpython-review-toolkit/agents/pep7-style-checker.md @@ -85,3 +85,10 @@ Provide actionable guidance: - **Generated code gets a pass**: Files like `Python/opcode_targets.h` are generated — don't flag style issues in generated files. - **Third-party code gets a pass**: Code under `Modules/_decimal/` or similar vendored directories follows their upstream style. - **Consistency matters more than perfection**: A file that consistently uses one style is better than a file with mixed styles, even if the consistent style isn't PEP 7. + +## Running the script + +- Call the script with a Bash timeout of **300000 ms** (5 min). The default 120s kills on large repos. +- Use a **unique temp filename** for the JSON output, e.g. `/tmp/pep7-style-checker__$$.json` — the `$$` PID suffix prevents collisions when multiple agents run concurrently. +- Forward `--max-files N` and (where supported) `--workers N` from the caller. +- If the script **times out or errors, do NOT retry it.** Fall back to Grep/Read for the same question. Long-running runs should use `run_in_background`. diff --git a/plugins/cpython-review-toolkit/agents/refcount-auditor.md b/plugins/cpython-review-toolkit/agents/refcount-auditor.md index d4d989f..f6a8caf 100644 --- a/plugins/cpython-review-toolkit/agents/refcount-auditor.md +++ b/plugins/cpython-review-toolkit/agents/refcount-auditor.md @@ -132,3 +132,42 @@ Beyond script findings, look for these patterns in the code: - **Context matters**: A refcount leak in a rarely-called initialization function is less critical than one in a hot loop in ceval.c. 
- **CPython's own patterns**: CPython code sometimes intentionally leaks references to immortal objects (None, True, False) or module-level objects that live for the process lifetime. Don't flag these. - **Be precise**: Include exact line numbers, variable names, and API calls in every finding. Vague findings are not actionable. + +## Safety Annotations + +`scan_refcounts.py` looks at C comments within +/- 5 lines of each candidate +finding. If any comment contains one of the following keywords (case-insensitive +substring match), the finding is downgraded to `confidence: low` and marked +`suppressed_by_annotation: true`. Reviewers should still eyeball these — the +annotation is a hint, not a proof. + +Suppressing keywords (add to the comment nearest the flagged line): + +- `safety:` / `checked:` — reviewer vouches for the call site +- `safe because` / `correct because` / `this is safe` — justification follows +- `intentional` / `by design` / `deliberately` / `expected` — pattern is chosen +- `not a bug` — known-false-positive marker +- `borrowed ok` — borrowed reference provably lives long enough +- `refcount safe` — refcount is accounted for elsewhere +- `nolint` — general lint-suppression convention + +Example: +```c +/* safety: PyList_GetItem returns a borrowed ref; list owned by caller. */ +PyObject *item = PyList_GetItem(list, 0); +``` + +## Running the script + +- Call the script with a Bash timeout of **300000 ms** (5 min). The default 120s kills on large repos. +- Use a **unique temp filename** for the JSON output, e.g. `/tmp/refcount-auditor__$$.json` — the `$$` PID suffix prevents collisions when multiple agents run concurrently. +- Forward `--max-files N` and (where supported) `--workers N` from the caller. +- If the script **times out or errors, do NOT retry it.** Fall back to Grep/Read for the same question. Long-running runs should use `run_in_background`. 
+ +## Confidence + +- **HIGH** — structurally identical to a known-bad pattern, or exact signature match; ≥90% likelihood of being a true positive. +- **MEDIUM** — similar with differences that require human verification; 70–89%. +- **LOW** — superficially similar; requires code-context reading; 50–69%. + +Findings below LOW are not reported. diff --git a/plugins/cpython-review-toolkit/commands/explore.md b/plugins/cpython-review-toolkit/commands/explore.md index f82b03b..4a87fb0 100644 --- a/plugins/cpython-review-toolkit/commands/explore.md +++ b/plugins/cpython-review-toolkit/commands/explore.md @@ -137,31 +137,52 @@ After all agents complete, perform deduplication and conflict resolution, then p ## Findings by Priority -### Must Fix (FIX) -[Crash risks, memory corruption, undefined behavior] +**Use global non-restarting numbering**: number ALL findings sequentially across all sections. FIX findings first (1-N), then CONSIDER (N+1-M), then POLICY (M+1-P), then ACCEPTABLE (P+1-Q). Use these same numbers in the action plan. This makes it easy to reference "Finding 37" in issue trackers and emails. -### Should Consider (CONSIDER) -[Improvement opportunities with trade-offs] +### Must Fix (FIX) — N + +| # | Finding | File:Line | Agents | +|---|---------|-----------|--------| +| 1 | [Crash risk / memory corruption / UB — description] | [file:line] | [which agents found it] | + +### Should Consider (CONSIDER) — M + +| # | Finding | File:Line | +|---|---------|-----------| +| N+1 | [Improvement opportunity with trade-offs — description] | [file:line] | ### Tensions [Where agents disagree] -### Policy Decisions (POLICY) -[Team-level decisions needed] +### Policy Decisions (POLICY) — P + +| # | Finding | +|---|---------| +| M+1 | [Team-level decision — description] | + +### Acceptable (ACCEPTABLE) — Q + +| # | Finding | +|---|---------| +| P+1 | [Informational / no action required — description] | ## Strengths [What the C code does well] ## Recommended Action Plan -### Immediate -1. 
[FIX items — safety-critical] +Reference findings by their global number: + +### Immediate (FIX items) +1. [Fix Finding 1 — description] +2. [Fix Finding 2 — description] -### Short-term -1. [CONSIDER items — quality improvements] +### Short-term (CONSIDER items) +1. [Finding N+1 — description] +2. [Finding N+2 — description] -### Ongoing -1. [POLICY decisions to make] +### Longer-term (POLICY) +1. [Finding M+1 — description] ``` ## Usage Examples diff --git a/plugins/cpython-review-toolkit/commands/health.md b/plugins/cpython-review-toolkit/commands/health.md index b8539b5..52542e4 100644 --- a/plugins/cpython-review-toolkit/commands/health.md +++ b/plugins/cpython-review-toolkit/commands/health.md @@ -39,9 +39,14 @@ Run all agents in summary mode to produce a quick health dashboard. Each agent r ## Overall Health: X/10 ## Top 3 Priorities -1. [Most impactful improvement] -2. [Next] -3. [Next] + +Reference findings by global non-restarting numbering (FIX 1..N, CONSIDER N+1..M, +POLICY M+1..P, ACCEPTABLE P+1..Q). Use the same numbers in `/explore` output so +issues tracked here carry over to the detailed report. + +1. [Finding # — most impactful improvement] +2. [Finding # — next] +3. [Finding # — next] For detailed analysis, run: /cpython-review-toolkit:explore . 
[aspect] deep diff --git a/plugins/cpython-review-toolkit/scripts/analyze_includes.py b/plugins/cpython-review-toolkit/scripts/analyze_includes.py index f6d16e0..358dc1d 100644 --- a/plugins/cpython-review-toolkit/scripts/analyze_includes.py +++ b/plugins/cpython-review-toolkit/scripts/analyze_includes.py @@ -9,9 +9,10 @@ - api_tiers: classification of headers into public/cpython/internal Usage: - python analyze_includes.py [path] + python analyze_includes.py [path] [--max-files N] path: directory, file, or omitted for current directory + --max-files N: cap the number of .c/.h files scanned (0 = unlimited) """ import json diff --git a/plugins/cpython-review-toolkit/scripts/check_pep7.py b/plugins/cpython-review-toolkit/scripts/check_pep7.py index f5f4ed7..d79039f 100644 --- a/plugins/cpython-review-toolkit/scripts/check_pep7.py +++ b/plugins/cpython-review-toolkit/scripts/check_pep7.py @@ -8,9 +8,10 @@ - trailing whitespace, missing braces Usage: - python check_pep7.py [path] + python check_pep7.py [path] [--max-files N] path: directory, file, or omitted for current directory + --max-files N: cap the number of .c/.h files scanned (0 = unlimited) """ import json diff --git a/plugins/cpython-review-toolkit/scripts/measure_c_complexity.py b/plugins/cpython-review-toolkit/scripts/measure_c_complexity.py index f99385b..404f989 100644 --- a/plugins/cpython-review-toolkit/scripts/measure_c_complexity.py +++ b/plugins/cpython-review-toolkit/scripts/measure_c_complexity.py @@ -7,9 +7,10 @@ - weighted score (1-10) Usage: - python measure_c_complexity.py [path] + python measure_c_complexity.py [path] [--max-files N] path: directory, file, or omitted for current directory + --max-files N: cap the number of .c/.h files scanned (0 = unlimited) """ import json diff --git a/plugins/cpython-review-toolkit/scripts/scan_common.py b/plugins/cpython-review-toolkit/scripts/scan_common.py new file mode 100644 index 0000000..e1b9d36 --- /dev/null +++ 
b/plugins/cpython-review-toolkit/scripts/scan_common.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +"""Shared utilities for cpython-review-toolkit scripts. Stdlib-only.""" +from __future__ import annotations + +import re +from pathlib import Path +from typing import Iterable + + +# Matches /* ... */ block comments (possibly multi-line) and // line comments. +_BLOCK_COMMENT_RE = re.compile(r"/\*.*?\*/", re.DOTALL) +_LINE_COMMENT_RE = re.compile(r"//[^\n]*") + + +_SAFETY_KEYWORDS = frozenset({ + "safety:", "safe because", "intentional", "by design", "nolint", + "checked:", "correct because", "this is safe", "not a bug", + "deliberately", "expected", "gil held", "gil-held", "already locked", + "already protected", "refcount safe", "borrowed ok", +}) + + +def extract_nearby_comments(source: str, line: int, radius: int = 5) -> list[str]: + """Return C-style comments within +/- `radius` lines of 1-based `line`. + + Scans /* ... */ and // ... comments. Each returned item is the comment + text with the markers stripped. Handles multi-line block comments. + """ + lines = source.splitlines() + if not lines: + return [] + min_line = max(1, line - radius) + max_line = min(len(lines), line + radius) + # Compute byte offsets for each line start. + offsets = [0] + for ln in lines: + offsets.append(offsets[-1] + len(ln) + 1) # +1 for \n + + def line_of(offset: int) -> int: + # Binary search. 
+ lo, hi = 0, len(offsets) - 1 + while lo < hi: + mid = (lo + hi + 1) // 2 + if offsets[mid] <= offset: + lo = mid + else: + hi = mid - 1 + return lo + 1 # 1-based + + comments: list[str] = [] + for m in _BLOCK_COMMENT_RE.finditer(source): + start_line = line_of(m.start()) + end_line = line_of(m.end() - 1) + if start_line <= max_line and end_line >= min_line: + text = m.group(0)[2:-2].strip() + comments.append(text) + for m in _LINE_COMMENT_RE.finditer(source): + ln = line_of(m.start()) + if min_line <= ln <= max_line: + comments.append(m.group(0)[2:].strip()) + return comments + + +def has_safety_annotation(comments: Iterable[str]) -> bool: + """True if any comment contains a safety-annotation keyword.""" + for c in comments: + lower = c.lower() + if any(kw in lower for kw in _SAFETY_KEYWORDS): + return True + return False + + +def make_finding( + finding_type: str, + *, + file: str = "", + line: int = 0, + function: str = "", + classification: str, + severity: str, + confidence: str = "high", + detail: str, + **extra, +) -> dict: + """Create a finding dict with a consistent shape.""" + finding: dict = { + "type": finding_type, + "file": file, + "line": line, + "function": function, + "classification": classification, + "severity": severity, + "confidence": confidence, + "detail": detail, + } + finding.update(extra) + return finding + + +def load_json_data(path: Path) -> dict: + """Load a JSON data file. Emit stderr warning on failure; return {}. + + Unlike a bare `json.load`, this never silently returns empty: callers + can still proceed (returning 0 findings) but the warning flags the + degraded state to the user. 
+ """ + import json + import sys + try: + with open(path, encoding="utf-8") as f: + return json.load(f) + except (OSError, json.JSONDecodeError) as e: + print(f"WARNING: Failed to load {path}: {e}", file=sys.stderr) + return {} diff --git a/plugins/cpython-review-toolkit/scripts/scan_error_paths.py b/plugins/cpython-review-toolkit/scripts/scan_error_paths.py index b333e2f..cf8eaed 100644 --- a/plugins/cpython-review-toolkit/scripts/scan_error_paths.py +++ b/plugins/cpython-review-toolkit/scripts/scan_error_paths.py @@ -5,9 +5,10 @@ PyErr_Set*, error path cleanup issues, and inconsistent error returns. Usage: - python scan_error_paths.py [path] + python scan_error_paths.py [path] [--max-files N] path: directory, file, or omitted for current directory + --max-files N: cap the number of .c/.h files scanned (0 = unlimited) """ import json @@ -16,6 +17,10 @@ from pathlib import Path from typing import Generator +# Allow importing sibling scan_common. +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from scan_common import extract_nearby_comments, has_safety_annotation # noqa: E402 + # --------------------------------------------------------------------------- # Shared utilities @@ -346,6 +351,12 @@ def analyze(target: str, *, max_files: int = 0) -> dict: finding["file"] = rel finding["function"] = func["name"] finding["line"] = func["start_line"] + finding.pop("line_offset") + # Annotation-aware suppression: downgrade to low confidence + # when a nearby safety comment is present. + nearby = extract_nearby_comments(source, finding["line"]) + if has_safety_annotation(nearby): + finding["confidence"] = "low" + finding["suppressed_by_annotation"] = True all_findings.append(finding) # Categorize. 
diff --git a/plugins/cpython-review-toolkit/scripts/scan_gil_usage.py b/plugins/cpython-review-toolkit/scripts/scan_gil_usage.py index ec06822..e79b198 100644 --- a/plugins/cpython-review-toolkit/scripts/scan_gil_usage.py +++ b/plugins/cpython-review-toolkit/scripts/scan_gil_usage.py @@ -5,9 +5,10 @@ the GIL, blocking calls with the GIL held, and PyGILState balance. Usage: - python scan_gil_usage.py [path] + python scan_gil_usage.py [path] [--max-files N] path: directory, file, or omitted for current directory + --max-files N: cap the number of .c/.h files scanned (0 = unlimited) """ import json @@ -16,6 +17,10 @@ from pathlib import Path from typing import Generator +# Allow importing sibling scan_common. +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from scan_common import extract_nearby_comments, has_safety_annotation # noqa: E402 + # --------------------------------------------------------------------------- # Shared utilities @@ -296,6 +301,12 @@ def analyze(target: str, *, max_files: int = 0) -> dict: finding["file"] = rel finding["function"] = func["name"] finding["line"] = func["start_line"] + finding.pop("line_offset") + # Annotation-aware suppression: downgrade to low confidence + # when a nearby safety comment is present. + nearby = extract_nearby_comments(source, finding["line"]) + if has_safety_annotation(nearby): + finding["confidence"] = "low" + finding["suppressed_by_annotation"] = True all_findings.append(finding) mismatched = [f for f in all_findings if "mismatched" in f["type"]] diff --git a/plugins/cpython-review-toolkit/scripts/scan_null_checks.py b/plugins/cpython-review-toolkit/scripts/scan_null_checks.py index 67b1e70..eea923e 100644 --- a/plugins/cpython-review-toolkit/scripts/scan_null_checks.py +++ b/plugins/cpython-review-toolkit/scripts/scan_null_checks.py @@ -5,9 +5,10 @@ and PyArg_Parse* issues. 
Usage: - python scan_null_checks.py [path] + python scan_null_checks.py [path] [--max-files N] path: directory, file, or omitted for current directory + --max-files N: cap the number of .c/.h files scanned (0 = unlimited) """ import json @@ -16,6 +17,10 @@ from pathlib import Path from typing import Generator +# Allow importing sibling scan_common. +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from scan_common import extract_nearby_comments, has_safety_annotation # noqa: E402 + # --------------------------------------------------------------------------- # Shared utilities @@ -273,6 +278,12 @@ def analyze(target: str, *, max_files: int = 0) -> dict: finding["file"] = rel finding["function"] = func["name"] finding["line"] = func["start_line"] + finding.pop("line_offset") + # Annotation-aware suppression: downgrade to low confidence + # when a nearby safety comment is present. + nearby = extract_nearby_comments(source, finding["line"]) + if has_safety_annotation(nearby): + finding["confidence"] = "low" + finding["suppressed_by_annotation"] = True all_findings.append(finding) unchecked = [f for f in all_findings if f["type"] == "unchecked_alloc"] diff --git a/plugins/cpython-review-toolkit/scripts/scan_refcounts.py b/plugins/cpython-review-toolkit/scripts/scan_refcounts.py index dad2cee..4d010d3 100644 --- a/plugins/cpython-review-toolkit/scripts/scan_refcounts.py +++ b/plugins/cpython-review-toolkit/scripts/scan_refcounts.py @@ -7,9 +7,10 @@ Outputs a JSON structure with per-function refcount balance analysis. Usage: - python scan_refcounts.py [path] + python scan_refcounts.py [path] [--max-files N] path: directory, file, or omitted for current directory + --max-files N: cap the number of .c/.h files scanned (0 = unlimited) """ import json @@ -18,6 +19,10 @@ from pathlib import Path from typing import Generator +# Allow importing sibling scan_common. 
+sys.path.insert(0, str(Path(__file__).resolve().parent)) +from scan_common import extract_nearby_comments, has_safety_annotation # noqa: E402 + # --------------------------------------------------------------------------- # Shared utilities @@ -699,6 +704,12 @@ def analyze(target: str, *, max_files: int = 0) -> dict: finding["file"] = rel finding["function"] = func["name"] finding["line"] = func["start_line"] + finding.pop("line_offset") + # Annotation-aware suppression: downgrade to low confidence + # when a nearby safety comment is present. + nearby = extract_nearby_comments(source, finding["line"]) + if has_safety_annotation(nearby): + finding["confidence"] = "low" + finding["suppressed_by_annotation"] = True all_findings.append(finding) # Categorize findings. diff --git a/tests/test_scan_common.py b/tests/test_scan_common.py new file mode 100644 index 0000000..7da2483 --- /dev/null +++ b/tests/test_scan_common.py @@ -0,0 +1,166 @@ +"""Tests for scan_common helpers (stdlib-only).""" + +import io +import json +import sys +import tempfile +import unittest +from contextlib import redirect_stderr +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from helpers import import_script # noqa: E402 + + +class TestExtractNearbyComments(unittest.TestCase): + def setUp(self): + self.mod = import_script("scan_common") + + def test_no_comments(self): + src = "int x = 1;\nint y = 2;\nint z = 3;\n" + self.assertEqual(self.mod.extract_nearby_comments(src, 2), []) + + def test_line_comment_within_radius(self): + src = "int x = 1;\n// a note here\nint z = 3;\n" + comments = self.mod.extract_nearby_comments(src, 2, radius=1) + self.assertIn("a note here", comments) + + def test_block_comment_within_radius(self): + src = "int x = 1;\n/* block comment */\nint z = 3;\n" + comments = self.mod.extract_nearby_comments(src, 2, radius=1) + self.assertTrue(any("block comment" in c for c in comments)) + + def test_multiline_block_comment(self): + src = ( + 
"int x = 1;\n" + "/* line 2 of comment\n" + " * line 3 of comment\n" + " * line 4 of comment */\n" + "int y = 5;\n" + "int z = 6;\n" + ) + # Finding on line 5 should see the multi-line block comment. + comments = self.mod.extract_nearby_comments(src, 5, radius=1) + self.assertTrue(any("line 2 of comment" in c for c in comments)) + + def test_comment_outside_radius(self): + src = "// far away\n" + "int x = 0;\n" * 20 + # Line 15 is far from the comment on line 1 with default radius. + comments = self.mod.extract_nearby_comments(src, 15, radius=5) + self.assertEqual(comments, []) + + def test_empty_source(self): + self.assertEqual(self.mod.extract_nearby_comments("", 1), []) + + +class TestHasSafetyAnnotation(unittest.TestCase): + def setUp(self): + self.mod = import_script("scan_common") + + def test_positive_safety_colon(self): + self.assertTrue( + self.mod.has_safety_annotation(["safety: reviewed by bob"]) + ) + + def test_positive_intentional(self): + self.assertTrue( + self.mod.has_safety_annotation(["this is intentional behavior"]) + ) + + def test_positive_case_insensitive(self): + self.assertTrue( + self.mod.has_safety_annotation(["This Is Safe to ignore"]) + ) + + def test_positive_gil_held(self): + self.assertTrue( + self.mod.has_safety_annotation(["gil-held by caller"]) + ) + + def test_negative_no_keyword(self): + self.assertFalse( + self.mod.has_safety_annotation(["just a regular comment"]) + ) + + def test_negative_empty(self): + self.assertFalse(self.mod.has_safety_annotation([])) + + +class TestMakeFinding(unittest.TestCase): + def setUp(self): + self.mod = import_script("scan_common") + + def test_required_keys_present(self): + f = self.mod.make_finding( + "my_type", + classification="FIX", + severity="high", + detail="boom", + ) + for k in ( + "type", "file", "line", "function", + "classification", "severity", "confidence", "detail", + ): + self.assertIn(k, f) + self.assertEqual(f["type"], "my_type") + self.assertEqual(f["confidence"], "high") # 
default + + def test_extra_kwargs_merged(self): + f = self.mod.make_finding( + "t", + classification="CONSIDER", + severity="low", + detail="d", + api_call="PyList_New", + variable="obj", + ) + self.assertEqual(f["api_call"], "PyList_New") + self.assertEqual(f["variable"], "obj") + + +class TestLoadJsonData(unittest.TestCase): + def setUp(self): + self.mod = import_script("scan_common") + + def test_missing_file_warns_and_returns_empty(self): + missing = Path(tempfile.gettempdir()) / "cpyrt_does_not_exist_xyz.json" + if missing.exists(): + missing.unlink() + buf = io.StringIO() + with redirect_stderr(buf): + result = self.mod.load_json_data(missing) + self.assertEqual(result, {}) + self.assertIn("WARNING", buf.getvalue()) + self.assertIn("Failed to load", buf.getvalue()) + + def test_invalid_json_warns_and_returns_empty(self): + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False, encoding="utf-8", + ) as f: + f.write("not { valid json") + path = Path(f.name) + try: + buf = io.StringIO() + with redirect_stderr(buf): + result = self.mod.load_json_data(path) + self.assertEqual(result, {}) + self.assertIn("WARNING", buf.getvalue()) + finally: + path.unlink() + + def test_valid_json_round_trip(self): + data = {"foo": "bar", "n": 3} + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False, encoding="utf-8", + ) as f: + json.dump(data, f) + path = Path(f.name) + try: + result = self.mod.load_json_data(path) + self.assertEqual(result, data) + finally: + path.unlink() + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_scan_error_paths.py b/tests/test_scan_error_paths.py index 52c3c36..dd2c016 100644 --- a/tests/test_scan_error_paths.py +++ b/tests/test_scan_error_paths.py @@ -25,6 +25,9 @@ def test_detects_missing_null_check(self): ) with TempProject({"Objects/test.c": c_code}) as root: result = mod.analyze(str(root)) + # Envelope sanity: silent-failure guard. 
+ self.assertGreater(result["files_analyzed"], 0) + self.assertGreater(result["functions_analyzed"], 0) findings = result["findings"] unchecked = [ f for f in findings @@ -32,6 +35,27 @@ def test_detects_missing_null_check(self): ] self.assertGreater(len(unchecked), 0) + def test_safety_annotation_downgrades_finding(self): + c_code = ( + "static PyObject *\n" + "no_check(PyObject *self, PyObject *args)\n" + "{\n" + " /* safety: result is checked by Str() before deref */\n" + " PyObject *result = PyObject_GetAttrString(self, \"name\");\n" + " PyObject *str = PyObject_Str(result);\n" + " return str;\n" + "}\n" + ) + with TempProject({"Objects/test.c": c_code}) as root: + result = mod.analyze(str(root)) + relevant = [ + f for f in result["findings"] + if f["type"] in ("missing_null_check", "unchecked_return") + ] + for f in relevant: + self.assertEqual(f.get("confidence"), "low") + self.assertTrue(f.get("suppressed_by_annotation")) + def test_clean_error_handling(self): c_code = ( "static PyObject *\n" diff --git a/tests/test_scan_gil_usage.py b/tests/test_scan_gil_usage.py index d1185b3..af263a2 100644 --- a/tests/test_scan_gil_usage.py +++ b/tests/test_scan_gil_usage.py @@ -25,12 +25,37 @@ def test_detects_mismatched_threads(self): ) with TempProject({"Modules/test.c": c_code}) as root: result = mod.analyze(str(root)) + # Envelope sanity: silent-failure guard. 
+ self.assertGreater(result["files_analyzed"], 0) + self.assertGreater(result["functions_analyzed"], 0) mismatched = [ f for f in result["findings"] if f["type"] == "mismatched_allow_threads" ] self.assertGreater(len(mismatched), 0) + def test_safety_annotation_downgrades_finding(self): + c_code = ( + "static int\n" + "api_no_gil(PyObject *self)\n" + "{\n" + " Py_BEGIN_ALLOW_THREADS\n" + " /* safety: gil-held by callee via internal mutex */\n" + " PyObject_CallMethod(self, \"method\", NULL);\n" + " Py_END_ALLOW_THREADS\n" + " return 0;\n" + "}\n" + ) + with TempProject({"Modules/test.c": c_code}) as root: + result = mod.analyze(str(root)) + api_findings = [ + f for f in result["findings"] + if f["type"] == "api_without_gil" + ] + for f in api_findings: + self.assertEqual(f.get("confidence"), "low") + self.assertTrue(f.get("suppressed_by_annotation")) + def test_balanced_threads_no_finding(self): c_code = ( "static int\n" diff --git a/tests/test_scan_null_checks.py b/tests/test_scan_null_checks.py index 51d8e7b..094cb58 100644 --- a/tests/test_scan_null_checks.py +++ b/tests/test_scan_null_checks.py @@ -26,12 +26,39 @@ def test_detects_unchecked_malloc(self): ) with TempProject({"Objects/test.c": c_code}) as root: result = mod.analyze(str(root)) + # Envelope sanity: silent-failure guard. + self.assertGreater(result["files_analyzed"], 0) + self.assertGreater(result["functions_analyzed"], 0) findings = result["findings"] unchecked = [ f for f in findings if f["type"] == "unchecked_alloc" ] self.assertGreater(len(unchecked), 0) + def test_safety_annotation_downgrades_finding(self): + # Same flaw as test_detects_unchecked_malloc, but annotated. 
+ c_code = ( + "static int\n" + "bad_alloc(int n)\n" + "{\n" + " /* safety: caller guarantees non-NULL buf via preallocation */\n" + " char *buf = PyMem_Malloc(n);\n" + " buf->data = 0;\n" + " PyMem_Free(buf);\n" + " return 0;\n" + "}\n" + ) + with TempProject({"Objects/test.c": c_code}) as root: + result = mod.analyze(str(root)) + unchecked = [ + f for f in result["findings"] + if f["type"] == "unchecked_alloc" + ] + # Any finding that remains must be downgraded. + for f in unchecked: + self.assertEqual(f.get("confidence"), "low") + self.assertTrue(f.get("suppressed_by_annotation")) + def test_checked_malloc_no_finding(self): c_code = ( "static int\n" diff --git a/tests/test_scan_refcounts.py b/tests/test_scan_refcounts.py index ae3ccda..5fbbf31 100644 --- a/tests/test_scan_refcounts.py +++ b/tests/test_scan_refcounts.py @@ -40,6 +40,34 @@ def test_detects_leaked_reference(self): self.assertGreaterEqual(len(leaks), 0) # The function should be analyzed. self.assertGreater(result["functions_analyzed"], 0) + # Envelope sanity: silent-failure guard. + self.assertGreater(result["files_analyzed"], 0) + + def test_safety_annotation_downgrades_finding(self): + # Allocates a new ref that is neither DECREF'd, stolen, nor + # returned — a canonical potential_leak — but marked as safe. + c_code = ( + "static void\n" + "leaky_intentional(void)\n" + "{\n" + " /* intentional: cache lives for process lifetime */\n" + " PyObject *cache = PyList_New(0);\n" + " (void)cache;\n" + "}\n" + ) + with TempProject({"Objects/test.c": c_code}) as root: + result = mod.analyze(str(root)) + # Envelope sanity. + self.assertGreater(result["files_analyzed"], 0) + self.assertGreater(result["functions_analyzed"], 0) + leaks = [ + f for f in result["findings"] + if f["type"] == "potential_leak" + and f.get("variable") == "cache" + ] + for f in leaks: + self.assertEqual(f.get("confidence"), "low") + self.assertTrue(f.get("suppressed_by_annotation")) def test_clean_function_no_findings(self): c_code = (