fix(feature-ideation): address CodeRabbit re-review on PR #85 (15 fixes + 5 new tests)

github-actions[bot] · don-petry · github-actions[bot] · commit 5c52288f5bb1 · 2026-04-08T19:59:04.000Z
Critical/major:
- collect-signals.sh: validate ISSUE_LIMIT/PR_LIMIT/DISCUSSION_LIMIT are
  positive integers; tighten REPO validation with strict ^[^/]+/[^/]+$ regex
- compose-signals.sh: enforce array type (jq 'type == "array"') not just
  valid JSON so objects/strings don't silently produce wrong counts
- date-utils.sh: guard $# before reading $1 to prevent set -u abort on
  zero-arg calls
- filter-bots.sh: replace unquoted array expansion with IFS=',' read -r -a
  to prevent pathname-globbing against filesystem entries
- gh-safe.sh: bounds-check args[i+1] before --jq dereference; add $# guard
  to gh_safe_graphql_input() to prevent nounset abort
- lint-prompt.sh: recognise YAML chomping modifiers (|-,|+,>-,>+) in
  prompt_marker regex; replace [^}]* GH-expression stripper with a stateful
  scanner that handles nested braces; preserve exit-2 over exit-1 in main()
- match-discussions.sh: wrap json.load calls in try/except for structured
  error exit-2 instead of Python traceback; skip discussions without an id;
  switch from per-proposal greedy to global similarity-sorted greedy matching
- validate-signals.py: catch OSError on read_text() to preserve exit-2
  contract; add -> bool return type annotation to _check_date_time

Docs:
- README.md: update lint command to mention both direct_prompt: and prompt:;
  fix Mary's prompt pointer to feature-ideation-reusable.yml

Tests (+5 new, 109 → 114 total):
- lint-prompt.bats: missing-file-before-lint-failing-file exits 2; YAML
  chomping modifiers detected; nested GH expressions don't false-positive
- match-discussions.bats: malformed signals JSON exits non-zero; malformed
  proposals JSON exits non-zero
- signals-schema.bats: truncated/malformed JSON exits 2 not 1
- date-utils.bats: use date_today helper instead of raw date -u
- stubs/gh: prefer TT_TMP/BATS_TEST_TMPDIR for counter file isolation

Co-authored-by: don-petry <don-petry@users.noreply.github.com>
diff --git a/.github/scripts/feature-ideation/README.md b/.github/scripts/feature-ideation/README.md
@@ -51,7 +51,7 @@ bats test/workflows/feature-ideation/gh-safe.bats
   shellcheck -x collect-signals.sh lint-prompt.sh match-discussions.sh \
                  discussion-mutations.sh lib/*.sh)
 
-# Lint the workflow's direct_prompt block
+# Lint workflow prompt blocks (direct_prompt: and prompt:)
 bash .github/scripts/feature-ideation/lint-prompt.sh
 ```
 
@@ -76,7 +76,7 @@ signals.json is a breaking change and must:
 
 1. Bump `SCHEMA_VERSION` in `collect-signals.sh`.
 2. Update fixtures under `test/workflows/feature-ideation/fixtures/expected/`.
-3. Update Mary's prompt in `feature-ideation.yml` if any field references move.
+3. Update Mary's prompt in `.github/workflows/feature-ideation-reusable.yml` if any field references move.
 
 CI validates every fixture against the schema, and the workflow validates
 the runtime output before handing it to Mary.
diff --git a/.github/scripts/feature-ideation/collect-signals.sh b/.github/scripts/feature-ideation/collect-signals.sh
@@ -58,13 +58,33 @@ main() {
   local discussion_limit="${DISCUSSION_LIMIT:-100}"
   local output_path="${SIGNALS_OUTPUT:-./signals.json}"
 
-  local owner repo_name
-  owner="${REPO%%/*}"
-  repo_name="${REPO##*/}"
-  if [ "$owner" = "$REPO" ] || [ -z "$repo_name" ]; then
+  # Validate that limit overrides are positive integers before forwarding to
+  # GraphQL — a value like `ISSUE_LIMIT=foo` would cause an opaque downstream
+  # failure instead of a clean usage error. Caught by CodeRabbit review on
+  # PR petry-projects/.github#85.
+  local _lim_name _lim_val
+  for _lim_name in ISSUE_LIMIT PR_LIMIT DISCUSSION_LIMIT; do
+    case "$_lim_name" in
+      ISSUE_LIMIT)      _lim_val="$issue_limit" ;;
+      PR_LIMIT)         _lim_val="$pr_limit" ;;
+      DISCUSSION_LIMIT) _lim_val="$discussion_limit" ;;
+    esac
+    if [[ ! $_lim_val =~ ^[1-9][0-9]*$ ]]; then
+      printf '[collect-signals] %s must be a positive integer, got: %s\n' "$_lim_name" "$_lim_val" >&2
+      return 64
+    fi
+  done
+
+  # Strict owner/name format — reject leading/trailing slashes, empty segments,
+  # and extra path parts (e.g. "org//repo", "/repo", "org/repo/extra").
+  # Caught by CodeRabbit review on PR petry-projects/.github#85.
+  if [[ ! $REPO =~ ^[^/]+/[^/]+$ ]]; then
     printf '[collect-signals] REPO must be in owner/name format, got: %s\n' "$REPO" >&2
     return 64
   fi
+  local owner repo_name
+  owner="${REPO%%/*}"
+  repo_name="${REPO##*/}"
 
   local thirty_days_ago
   thirty_days_ago=$(date_days_ago 30)
diff --git a/.github/scripts/feature-ideation/lib/compose-signals.sh b/.github/scripts/feature-ideation/lib/compose-signals.sh
@@ -49,8 +49,11 @@ compose_signals() {
   for input in "$open_issues" "$closed_issues" "$ideas_discussions" "$releases" \
                "$merged_prs" "$feature_requests" "$bug_reports" "$truncation_warnings"; do
     idx=$((idx + 1))
-    if ! printf '%s' "$input" | jq -e . >/dev/null 2>&1; then
-      printf '[compose-signals] arg #%d is not valid JSON: %s\n' "$idx" "${input:0:120}" >&2
+    # Require a JSON array, not just valid JSON. Objects/strings/nulls accepted
+    # by `jq -e .` would silently produce wrong counts (key count, char count).
+    # Caught by CodeRabbit review on PR petry-projects/.github#85.
+    if ! printf '%s' "$input" | jq -e 'type == "array"' >/dev/null 2>&1; then
+      printf '[compose-signals] arg #%d must be a JSON array: %s\n' "$idx" "${input:0:120}" >&2
       return 65  # EX_DATAERR
     fi
   done
diff --git a/.github/scripts/feature-ideation/lib/date-utils.sh b/.github/scripts/feature-ideation/lib/date-utils.sh
@@ -10,6 +10,13 @@ set -euo pipefail
 
 # Print an ISO date (YYYY-MM-DD) for N days ago in UTC.
 date_days_ago() {
+  # Guard arg count before reading $1: under set -u a zero-arg call would abort
+  # the shell with "unbound variable" instead of reaching the validation path.
+  # Caught by CodeRabbit review on PR petry-projects/.github#85.
+  if [ "$#" -ne 1 ]; then
+    printf '[date-utils] expected 1 arg (days), got: %d\n' "$#" >&2
+    return 64
+  fi
   local days="$1"
   if [ -z "$days" ] || ! printf '%s' "$days" | grep -Eq '^[0-9]+$'; then
     printf '[date-utils] days must be a non-negative integer, got: %s\n' "$days" >&2
diff --git a/.github/scripts/feature-ideation/lib/filter-bots.sh b/.github/scripts/feature-ideation/lib/filter-bots.sh
@@ -35,9 +35,11 @@ DEFAULT_BOT_AUTHORS=(
 filter_bots_build_list() {
   local list=("${DEFAULT_BOT_AUTHORS[@]}")
   if [ -n "${FEATURE_IDEATION_BOT_AUTHORS:-}" ]; then
-    local IFS=','
-    # shellcheck disable=SC2206
-    local extras=($FEATURE_IDEATION_BOT_AUTHORS)
+    # Use `IFS=',' read` (not unquoted expansion) to avoid pathname-globbing
+    # against the filesystem if any entry contains wildcard characters.
+    # Caught by CodeRabbit review on PR petry-projects/.github#85.
+    local extras=()
+    IFS=',' read -r -a extras <<<"${FEATURE_IDEATION_BOT_AUTHORS}"
     # Trim leading/trailing whitespace from each comma-separated entry so
     # `"bot1, bot2"` resolves to `bot1` and `bot2`, not `bot1` and ` bot2`.
     # Caught by CodeRabbit review on PR petry-projects/.github#85.
diff --git a/.github/scripts/feature-ideation/lib/gh-safe.sh b/.github/scripts/feature-ideation/lib/gh-safe.sh
@@ -93,6 +93,13 @@ gh_safe_graphql() {
   local i=0
   while [ "$i" -lt "${#args[@]}" ]; do
     if [ "${args[$i]}" = "--jq" ]; then
+      # Guard bounds before dereferencing args[i+1]: under set -u an out-of-
+      # bounds access aborts the shell. Caught by CodeRabbit review on PR
+      # petry-projects/.github#85.
+      if [ $((i + 1)) -ge "${#args[@]}" ]; then
+        _gh_safe_err "graphql-bad-args" "--jq requires a jq filter argument"
+        return 64
+      fi
       has_jq=1
       jq_filter="${args[$((i + 1))]}"
       break
@@ -187,6 +194,13 @@ gh_safe_graphql() {
 # Same defensive contract as `gh_safe_graphql`: any auth/network/schema
 # failure exits non-zero with a structured stderr message.
 gh_safe_graphql_input() {
+  # Guard arg count before reading $1: under set -u a zero-arg call aborts the
+  # shell instead of reaching the JSON validation. Caught by CodeRabbit review
+  # on PR petry-projects/.github#85.
+  if [ "$#" -ne 1 ]; then
+    _gh_safe_err "graphql-bad-input" "expected 1 arg: JSON request body, got $#"
+    return 64
+  fi
   local body="$1"
   if ! gh_safe_is_json "$body"; then
     _gh_safe_err "graphql-bad-input" "request body is not valid JSON"
diff --git a/.github/scripts/feature-ideation/lint-prompt.sh b/.github/scripts/feature-ideation/lint-prompt.sh
@@ -31,6 +31,34 @@ scan_file() {
 import re
 import sys
 
+
+def _strip_github_expressions(s: str) -> str:
+    """Remove ${{ ... }} GitHub Actions expressions from s.
+
+    Uses a stateful scanner instead of `[^}]*` regex so that expressions
+    containing `}` inside string literals (e.g. format() calls) are fully
+    consumed rather than prematurely terminated. This prevents false-positive
+    shell-expansion matches on content that is actually inside a GH expression.
+    Caught by CodeRabbit review on PR petry-projects/.github#85.
+    """
+    result: list[str] = []
+    i = 0
+    while i < len(s):
+        if s[i : i + 3] == "${{":
+            # Consume until we find the matching "}}"
+            j = i + 3
+            while j < len(s):
+                if s[j : j + 2] == "}}":
+                    j += 2
+                    break
+                j += 1
+            i = j  # skip the whole ${{ ... }} expression
+        else:
+            result.append(s[i])
+            i += 1
+    return "".join(result)
+
+
 path = sys.argv[1]
 try:
     with open(path, "r", encoding="utf-8") as f:
@@ -55,8 +83,10 @@ findings = []
 shell_expansion = re.compile(r'(?<![\\$])\$\([^)]*\)|(?<![\\$])\$\{[A-Za-z_][A-Za-z0-9_]*\}')
 
 # Recognise both `direct_prompt:` (v0) and `prompt:` (v1) markers, with
-# optional `|` or `>` block scalar indicators.
-prompt_marker = re.compile(r'(?:direct_prompt|prompt):\s*[|>]?\s*$')
+# optional `|` or `>` block scalar indicators plus YAML chomping modifiers
+# (`-` or `+`) so `prompt: |-`, `prompt: |+`, `prompt: >-`, `prompt: >+`
+# are all recognised. Caught by CodeRabbit review on PR petry-projects/.github#85.
+prompt_marker = re.compile(r'(?:direct_prompt|prompt):\s*[|>]?[-+]?\s*$')
 
 for lineno, raw in enumerate(lines, start=1):
     stripped = raw.lstrip(" ")
@@ -78,8 +108,13 @@ for lineno, raw in enumerate(lines, start=1):
             continue
 
         # We're inside the prompt body. Scan for shell expansions.
-        # First, strip out any GitHub Actions expressions so they don't trip us.
-        no_gh = re.sub(r'\$\{\{[^}]*\}\}', '', raw)
+        # First, strip out GitHub Actions ${{ ... }} expressions.
+        # The naive `[^}]*` regex stops at the first `}`, so expressions that
+        # contain `}` internally (e.g. format() calls or string literals) are
+        # not fully removed and leave false-positive shell expansion matches.
+        # Use a small stateful scanner instead.
+        # Caught by CodeRabbit review on PR petry-projects/.github#85.
+        no_gh = _strip_github_expressions(raw)
         for match in shell_expansion.finditer(no_gh):
             findings.append((lineno, match.group(0), raw.rstrip()))
 
@@ -107,15 +142,27 @@ main() {
   fi
 
   local exit=0
+  local file_rc=0
   for file in "$@"; do
     if [ ! -f "$file" ]; then
       printf '[lint-prompt] not found: %s\n' "$file" >&2
       exit=2
       continue
     fi
-    if ! scan_file "$file"; then
-      exit=1
+    # Capture the actual exit code so we preserve exit-2 (file error) over
+    # exit-1 (lint finding). A later lint failure must not overwrite an earlier
+    # file error. Caught by CodeRabbit review on PR petry-projects/.github#85.
+    if scan_file "$file"; then
+      file_rc=0
+    else
+      file_rc=$?
     fi
+    case "$file_rc" in
+      0) ;;
+      1) if [ "$exit" -eq 0 ]; then exit=1; fi ;;
+      2) exit=2 ;;
+      *) return "$file_rc" ;;
+    esac
   done
   return "$exit"
 }
diff --git a/.github/scripts/feature-ideation/match-discussions.sh b/.github/scripts/feature-ideation/match-discussions.sh
@@ -97,49 +97,91 @@ def jaccard(a: set[str], b: set[str]) -> float:
     return len(a & b) / len(a | b)
 
 
-with open(signals_path) as f:
-    signals = json.load(f)
-with open(proposals_path) as f:
-    proposals = json.load(f)
+def _load_json(path: str, label: str):
+    """Load JSON from path, exiting with code 2 on any read or parse error."""
+    try:
+        with open(path, encoding="utf-8") as f:
+            return json.load(f)
+    except OSError as exc:
+        sys.stderr.write(f"[match-discussions] cannot read {label}: {exc}\n")
+        sys.exit(2)
+    except json.JSONDecodeError as exc:
+        sys.stderr.write(f"[match-discussions] invalid JSON in {label}: {exc}\n")
+        sys.exit(2)
+
+
+signals = _load_json(signals_path, "signals")
+proposals = _load_json(proposals_path, "proposals")
 
 if not isinstance(proposals, list):
     sys.stderr.write("[match-discussions] proposals must be a JSON array\n")
     sys.exit(65)
 
 discussions = signals.get("ideas_discussions", {}).get("items", []) or []
-disc_norm = [(d, normalize(d.get("title", ""))) for d in discussions]
-
-matched = []
-new_candidates = []
-seen_disc_ids = set()
-
-for proposal in proposals:
+# Skip discussions without an id to avoid all id-less entries collapsing into
+# a single `None` key in seen_disc_ids. Caught by CodeRabbit on PR #85.
+disc_norm = [
+    (d, normalize(d.get("title", "")))
+    for d in discussions
+    if d.get("id") is not None
+]
+
+# --- Optimal (similarity-sorted) matching ------------------------------------
+# The original greedy per-proposal loop consumed discussions in proposal order,
+# so an early lower-value match could block a later higher-value match.
+# Instead we enumerate all (proposal, discussion) pairs, sort by similarity
+# descending (ties broken by original proposal index for stability), then
+# assign greedily. This guarantees globally higher-value matches are honoured
+# first. Caught by CodeRabbit review on PR petry-projects/.github#85.
+
+# Collect valid proposals with their original index (for tie-breaking + new_candidates).
+proposals_indexed: list[tuple[int, dict]] = []
+for p_idx, proposal in enumerate(proposals):
     if not isinstance(proposal, dict) or "title" not in proposal:
         sys.stderr.write(f"[match-discussions] skipping malformed proposal: {proposal!r}\n")
         continue
-    p_norm = normalize(proposal["title"])
+    proposals_indexed.append((p_idx, proposal))
 
-    best = None
-    best_sim = 0.0
+# Build all (similarity, proposal_idx, disc_id, proposal, disc) tuples.
+all_pairs: list[tuple[float, int, str, dict, dict]] = []
+for p_idx, proposal in proposals_indexed:
+    p_norm = normalize(proposal["title"])
     for disc, d_norm in disc_norm:
-        if disc.get("id") in seen_disc_ids:
-            continue
         sim = jaccard(p_norm, d_norm)
-        if sim > best_sim:
-            best_sim = sim
-            best = disc
+        all_pairs.append((sim, p_idx, disc["id"], proposal, disc))
 
-    if best is not None and best_sim >= threshold:
+# Sort descending by similarity; stable tie-break by proposal index ascending.
+all_pairs.sort(key=lambda x: (-x[0], x[1]))
+
+matched = []
+seen_disc_ids: set[str] = set()
+seen_proposal_idxs: set[int] = set()
+
+for sim, p_idx, disc_id, proposal, disc in all_pairs:
+    if p_idx in seen_proposal_idxs or disc_id in seen_disc_ids:
+        continue
+    if sim >= threshold:
         matched.append(
             {
                 "proposal": proposal,
-                "discussion": best,
-                "similarity": round(best_sim, 4),
+                "discussion": disc,
+                "similarity": round(sim, 4),
             }
         )
-        seen_disc_ids.add(best.get("id"))
-    else:
-        new_candidates.append({"proposal": proposal, "best_similarity": round(best_sim, 4)})
+        seen_disc_ids.add(disc_id)
+        seen_proposal_idxs.add(p_idx)
+
+# Unmatched proposals become new candidates.
+new_candidates = []
+for p_idx, proposal in proposals_indexed:
+    if p_idx in seen_proposal_idxs:
+        continue
+    p_norm = normalize(proposal["title"])
+    best_sim = max(
+        (jaccard(p_norm, d_norm) for _, d_norm in disc_norm),
+        default=0.0,
+    )
+    new_candidates.append({"proposal": proposal, "best_similarity": round(best_sim, 4)})
 
 result = {
     "matched": matched,
diff --git a/.github/scripts/feature-ideation/validate-signals.py b/.github/scripts/feature-ideation/validate-signals.py
@@ -53,7 +53,12 @@ def main(argv: list[str]) -> int:
         return 2
 
     try:
-        signals = json.loads(signals_path.read_text())
+        signals = json.loads(signals_path.read_text(encoding="utf-8"))
+    except OSError as exc:
+        # File read errors (permissions, I/O) must also exit 2. Caught by
+        # CodeRabbit review on PR petry-projects/.github#85.
+        sys.stderr.write(f"[validate-signals] cannot read {signals_path}: {exc}\n")
+        return 2
     except json.JSONDecodeError as exc:
         # Per the docstring contract, exit 2 means usage / file error and
         # exit 1 means schema validation error. A malformed signals file
@@ -63,7 +68,10 @@ def main(argv: list[str]) -> int:
         return 2
 
     try:
-        schema = json.loads(schema_path.read_text())
+        schema = json.loads(schema_path.read_text(encoding="utf-8"))
+    except OSError as exc:
+        sys.stderr.write(f"[validate-signals] cannot read schema {schema_path}: {exc}\n")
+        return 2
     except json.JSONDecodeError as exc:
         sys.stderr.write(f"[validate-signals] invalid schema JSON: {exc}\n")
         return 2
@@ -79,7 +87,7 @@ def main(argv: list[str]) -> int:
     format_checker = FormatChecker()
 
     @format_checker.checks("date-time", raises=(ValueError, TypeError))
-    def _check_date_time(instance):  # noqa: ANN001 — jsonschema callback signature
+    def _check_date_time(instance) -> bool:  # noqa: ANN001 — jsonschema callback signature
         if not isinstance(instance, str):
             return True  # non-strings handled by `type` keyword, not format
         # Must look like a date-time, not just any string. Require at least
diff --git a/test/workflows/feature-ideation/date-utils.bats b/test/workflows/feature-ideation/date-utils.bats
@@ -15,7 +15,10 @@ setup() {
 }
 
 @test "date_days_ago: 0 returns today" {
-  today=$(date -u +%Y-%m-%d)
+  # Use the helper function rather than the raw system `date` call so the test
+  # validates behaviour consistently on both GNU and BSD systems.
+  # Caught by CodeRabbit review on PR petry-projects/.github#85.
+  today=$(date_today)
   run date_days_ago 0
   [ "$status" -eq 0 ]
   [ "$output" = "$today" ]
diff --git a/test/workflows/feature-ideation/lint-prompt.bats b/test/workflows/feature-ideation/lint-prompt.bats
diff --git a/test/workflows/feature-ideation/match-discussions.bats b/test/workflows/feature-ideation/match-discussions.bats
diff --git a/test/workflows/feature-ideation/signals-schema.bats b/test/workflows/feature-ideation/signals-schema.bats
diff --git a/test/workflows/feature-ideation/stubs/gh b/test/workflows/feature-ideation/stubs/gh