@@ -97,49 +97,91 @@ def jaccard(a: set[str], b: set[str]) -> float:
9797 return len(a & b) / len(a | b)
9898
9999
100- with open(signals_path) as f:
101- signals = json.load(f)
102- with open(proposals_path) as f:
103- proposals = json.load(f)
100+ def _load_json(path: str, label: str):
101+ """Load JSON from path, exiting with code 2 if the file cannot be read (OSError) or is not valid JSON."""
102+ try:
103+ with open(path, encoding="utf-8") as f:
104+ return json.load(f)
105+ except OSError as exc:
106+ sys.stderr.write(f"[match-discussions] cannot read {label}: {exc}\n")
107+ sys.exit(2)
108+ except json.JSONDecodeError as exc:
109+ sys.stderr.write(f"[match-discussions] invalid JSON in {label}: {exc}\n")
110+ sys.exit(2)
111+
112+
113+ signals = _load_json(signals_path, "signals")
114+ proposals = _load_json(proposals_path, "proposals")
104115
105116if not isinstance(proposals, list):
106117 sys.stderr.write("[match-discussions] proposals must be a JSON array\n")
107118 sys.exit(65)
108119
109120discussions = signals.get("ideas_discussions", {}).get("items", []) or []
110- disc_norm = [(d, normalize(d.get("title", ""))) for d in discussions]
111-
112- matched = []
113- new_candidates = []
114- seen_disc_ids = set()
115-
116- for proposal in proposals:
121+ # Skip discussions without an id to avoid all id-less entries collapsing into
122+ # a single `None` key in seen_disc_ids. Caught by CodeRabbit on PR #85.
123+ disc_norm = [
124+ (d, normalize(d.get("title", "")))
125+ for d in discussions
126+ if d.get("id") is not None
127+ ]
128+
129+ # --- Similarity-sorted greedy matching ---------------------------------------
130+ # The original greedy per-proposal loop consumed discussions in proposal order,
131+ # so an early lower-value match could block a later higher-value match.
132+ # Instead we enumerate all (proposal, discussion) pairs, sort by similarity
133+ # descending (ties broken by original proposal index for stability), then
134+ # assign greedily so the highest-similarity pairs are honoured first (still a
135+ # heuristic, not a maximum-weight assignment). Caught by CodeRabbit review on PR petry-projects/.github#85.
136+
137+ # Collect valid proposals with their original index (for tie-breaking + new_candidates).
138+ proposals_indexed: list[tuple[int, dict]] = []
139+ for p_idx, proposal in enumerate(proposals):
117140 if not isinstance(proposal, dict) or "title" not in proposal:
118141 sys.stderr.write(f"[match-discussions] skipping malformed proposal: {proposal!r}\n")
119142 continue
120- p_norm = normalize( proposal["title"] )
143+ proposals_indexed.append((p_idx, proposal) )
121144
122- best = None
123- best_sim = 0.0
145+ # Build all (similarity, proposal_idx, disc_id, proposal, disc) tuples.
146+ all_pairs: list[tuple[float, int, str, dict, dict]] = []
147+ for p_idx, proposal in proposals_indexed:
148+ p_norm = normalize(proposal["title"])
124149 for disc, d_norm in disc_norm:
125- if disc.get("id") in seen_disc_ids:
126- continue
127150 sim = jaccard(p_norm, d_norm)
128- if sim > best_sim:
129- best_sim = sim
130- best = disc
151+ all_pairs.append((sim, p_idx, disc["id"], proposal, disc))
131152
132- if best is not None and best_sim >= threshold:
153+ # Sort descending by similarity; stable tie-break by proposal index ascending.
154+ all_pairs.sort(key=lambda x: (-x[0], x[1]))
155+
156+ matched = []
157+ seen_disc_ids: set[str] = set()
158+ seen_proposal_idxs: set[int] = set()
159+
160+ for sim, p_idx, disc_id, proposal, disc in all_pairs:
161+ if p_idx in seen_proposal_idxs or disc_id in seen_disc_ids:
162+ continue
163+ if sim >= threshold:
133164 matched.append(
134165 {
135166 "proposal": proposal,
136- "discussion": best ,
137- "similarity": round(best_sim , 4),
167+ "discussion": disc ,
168+ "similarity": round(sim , 4),
138169 }
139170 )
140- seen_disc_ids.add(best.get("id"))
141- else:
142- new_candidates.append({"proposal": proposal, "best_similarity": round(best_sim, 4)})
171+ seen_disc_ids.add(disc_id)
172+ seen_proposal_idxs.add(p_idx)
173+
174+ # Unmatched proposals become new candidates.
175+ new_candidates = []
176+ for p_idx, proposal in proposals_indexed:
177+ if p_idx in seen_proposal_idxs:
178+ continue
179+ p_norm = normalize(proposal["title"])
180+ best_sim = max(
181+ (jaccard(p_norm, d_norm) for _, d_norm in disc_norm),
182+ default=0.0,
183+ )
184+ new_candidates.append({"proposal": proposal, "best_similarity": round(best_sim, 4)})
143185
144186result = {
145187 "matched": matched,
0 commit comments