From 3244475e6f3e84340212861f8270ca92db10de07 Mon Sep 17 00:00:00 2001 From: Peter Corbett Date: Sun, 19 Apr 2026 10:40:20 -0700 Subject: [PATCH 1/3] feat: add v2 source-file analysis upgrades --- README.md | 16 +- docs/superpowers/handover-2026-04-17.md | 9 +- src/opendna/analyzer.py | 104 ++++- src/opendna/annotations/clinvar.json | 12 + src/opendna/annotations/pharmgkb.json | 12 + src/opendna/cli.py | 13 +- src/opendna/llm/base.py | 5 +- src/opendna/models.py | 86 ++++ src/opendna/panels/cardiovascular.json | 24 +- src/opendna/panels/pharmacogenomics.json | 11 + src/opendna/parser.py | 238 ++++++++++- src/opendna/report/render.py | 20 +- src/opendna/report/template.html.j2 | 298 ++++++++++--- src/opendna/server.py | 37 +- src/opendna/summaries.py | 517 +++++++++++++++++++++++ tests/test_analyzer.py | 14 + tests/test_cli.py | 2 + tests/test_e2e.py | 6 + tests/test_panels.py | 9 + tests/test_parser.py | 31 +- tests/test_report.py | 18 + tests/test_summaries.py | 53 +++ 22 files changed, 1432 insertions(+), 103 deletions(-) create mode 100644 src/opendna/summaries.py create mode 100644 tests/test_summaries.py diff --git a/README.md b/README.md index 1d3d1a2..fadeea9 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) [![Python 3.11+](https://img.shields.io/badge/python-3.11%2B-blue.svg)](https://www.python.org/downloads/) -**OpenDNA** parses a 23andMe / AncestryDNA / MyHeritage raw DNA file against 8 curated SNP panels (cardiovascular, methylation, pharmacogenomics, athletic, dietary, HFE, cognition, stimulant sensitivity), joins findings with ClinVar + PharmGKB/CPIC annotations, and renders a self-contained HTML report. Optional BYOK LLM synthesis (Anthropic Claude or OpenAI GPT) adds a prose interpretation layer. +**OpenDNA** parses a 23andMe / AncestryDNA / MyHeritage raw DNA file against 8 curated SNP panels (cardiovascular, methylation, pharmacogenomics, athletic, dietary, HFE, cognition, stimulant sensitivity), joins findings with ClinVar + PharmGKB/CPIC annotations, and renders a self-contained HTML report. It now also scores match confidence, shows panel coverage / blind spots, and rolls multi-marker genes like APOE, HFE, MTHFR, CYP2C19, and warfarin PGx into composite calls. Optional BYOK LLM synthesis (Anthropic Claude or OpenAI GPT) adds a prose interpretation layer. **What leaves your machine:** nothing, by default. No account, no upload, no telemetry. The raw DNA file is parsed, analyzed, and rendered entirely offline. If — and only if — you paste an API key and opt in to AI synthesis, OpenDNA sends the filtered *findings* (rsid + gene + genotype + tier + short note; roughly 40–50 lines of structured text) to the provider you chose. Your raw DNA file is never transmitted. See the [Privacy](#privacy--what-leaves-your-machine) section for the full rules. @@ -86,6 +86,8 @@ opendna --version pytest -v # (only if you installed [dev]) ``` +Current contributor baseline: `51` tests passing. + --- ## Quickstart — the web UI @@ -135,13 +137,13 @@ The JSON schema is stable and suitable for ingestion by another pipeline stage. ## What OpenDNA interprets -Every finding is tier-scored (`risk` / `warning` / `normal` / `unknown`) and, where available, cross-referenced with ClinVar clinical significance and PharmGKB/CPIC dosing guidelines. +Every finding is tier-scored (`risk` / `warning` / `normal` / `unknown`), annotated with match confidence, and, where available, cross-referenced with ClinVar clinical significance and PharmGKB/CPIC dosing guidelines. The report also separates `not on chip`, `no-call`, and `ambiguous` markers so missing data is not misread as a negative result. | Panel | Focus | Representative genes | |---|---|---| -| **Cardiovascular & Longevity** | CAD risk, lifespan | APOE, 9p21, FOXO3, LPA | +| **Cardiovascular & Longevity** | CAD risk, thrombosis, lifespan | APOE, 9p21, FOXO3, LPA, F5, F2 | | **Methylation & Detox** | Folate/B12 cycle | MTHFR, COMT, MTR/MTRR, FUT2, CBS, VDR | -| **Pharmacogenomics (PGx)** | Drug metabolism | CYP2C19, CYP2C9, VKORC1, SLCO1B1, TPMT, CYP3A5 | +| **Pharmacogenomics (PGx)** | Drug metabolism | CYP2C19, CYP2C9, VKORC1, CYP4F2, SLCO1B1, TPMT, CYP3A5 | | **Athletic Performance & Recovery** | Fiber type, recovery | ACTN3, PPARA, PPARGC1A, COL5A1, IL6 | | **Dietary Sensitivity** | Lactose, caffeine, alcohol, omega-3 | LCT, CYP1A2, ALDH2, FADS1, TAS2R38 | | **Iron Metabolism (HFE)** | Hemochromatosis | C282Y, H63D, S65C | @@ -150,6 +152,12 @@ Every finding is tier-scored (`risk` / `warning` / `normal` / `unknown`) and, wh The analyzer automatically handles both **allele-order variations** (`AG` == `GA`) and **reverse-strand reports** (panel `CC` matches file `GG` for C/T SNPs), so genotypes from any major consumer-testing vendor should resolve correctly. +### Source-file caveats, now surfaced in the report + +- `Not on chip` means the marker was not present in the source file. It does **not** mean the user has a reassuring genotype there. +- `No-call` means the vendor included the marker but did not make a confident genotype call. +- Composite calls are only made where the source file type can support them. OpenDNA still does **not** infer rare variants, structural variants, HLA types, CYP2D6 star alleles, or methylation from array data. + --- ## Troubleshooting diff --git a/docs/superpowers/handover-2026-04-17.md b/docs/superpowers/handover-2026-04-17.md index 9be0202..3598ecc 100644 --- a/docs/superpowers/handover-2026-04-17.md +++ b/docs/superpowers/handover-2026-04-17.md @@ -16,15 +16,16 @@ Local-first web utility that parses a raw 23andMe-style DNA file, matches agains ### Shipped features -- 8 curated SNP panels (cardiovascular, methylation, pharmacogenomics, athletic, dietary, HFE, cognition, stimulant sensitivity) — ~50 SNPs, each with ≥2 genotype interpretations. -- ClinVar (14 rsids) + PharmGKB (7 rsids, CPIC drug recs for TPMT / CYP2C19 / CYP2C9 / VKORC1 / SLCO1B1) shipped subsets. +- 8 curated SNP panels (cardiovascular, methylation, pharmacogenomics, athletic, dietary, HFE, cognition, stimulant sensitivity) — now expanded with F5 / F2 thrombophilia markers and CYP4F2 warfarin PGx support. +- ClinVar (16 rsids) + PharmGKB (8 rsids, CPIC drug recs for TPMT / CYP2C19 / CYP2C9 / VKORC1 / CYP4F2 / SLCO1B1) shipped subsets. - FastAPI server with five endpoints: `/`, `/api/panels`, `/api/analyze`, `/api/analyze-stream` (SSE for live progress), `/api/update-db`. - Vanilla-JS SPA: DOM APIs only (no `innerHTML` on dynamic data), report rendered into sandboxed iframe. Live progress bar with per-stage messages. Optional browser `localStorage` persistence of form values behind a *Remember* checkbox + *Clear saved values* button. - CLI: `opendna serve`, `opendna scan `, `opendna update-db`. - Jinja2 HTML report (autoescape on) styled after the original sandbox synthesis — stat tiles, color-coded card grid per panel, critical-findings alert box for risk tier, AI synthesis block when LLM is configured. +- Report now includes source-file QC, vendor/build heuristics, panel completeness, confidence labels, gene caveats, and composite summaries for APOE, HFE, MTHFR, CYP2C19, and warfarin PGx. - LLM provider layer: pluggable ABC, Anthropic default (`claude-sonnet-4-6` + prompt caching), OpenAI (`gpt-4o` using `max_completion_tokens`). API keys held per-request in memory only. - Analyzer normalizes allele order (`AG == GA`) AND reverse complement (panel `CC` matches file `GG` for non-palindromic C/T SNPs). -- 44 tests passing (parser, panels, analyzer, annotations, report, LLM providers, server, CLI, SSE stream, e2e). `ruff check` clean. CI runs on Python 3.11 / 3.12 / 3.13. +- 51 tests passing (parser, panels, analyzer, summaries, annotations, report, LLM providers, server, CLI, SSE stream, e2e). `ruff check` clean. CI runs on Python 3.11 / 3.12 / 3.13. - Apache 2.0 license. - README with full setup guide, per-provider raw-DNA download instructions, sample report + screenshot, troubleshooting, author links (`@corbett3000`, `stillrush.co/bio`). - Sample report: `examples/sample-report.html` (served live via raw.githack; preview image at `docs/assets/sample-report.png`). @@ -58,6 +59,8 @@ pytest -v # 44 tests should pass opendna serve # localhost:8787 ``` +The current baseline is `51` passing tests. + ## Key design decisions (for context) | Decision | Choice | Why | diff --git a/src/opendna/analyzer.py b/src/opendna/analyzer.py index a594636..448bed9 100644 --- a/src/opendna/analyzer.py +++ b/src/opendna/analyzer.py @@ -6,6 +6,16 @@ from opendna.models import Finding, Panel, SnpDef _COMPLEMENT = {"A": "T", "T": "A", "C": "G", "G": "C"} +_NO_CALL_GENOTYPES = {"--", "-", "00", "NN", "NC"} +_CONFIDENCE_SCORES = { + "exact": 1.0, + "normalized": 0.97, + "reverse_complement": 0.9, + "unmatched": 0.35, + "not_tested": 0.0, + "no_call": 0.0, + "ambiguous": 0.15, +} # A/T and C/G are palindromic sites: forward and reverse complements collide, # so RC inference is unreliable. Panels whose allele set is exactly one of @@ -50,7 +60,30 @@ def _is_palindromic_site(snp: SnpDef) -> bool: return _panel_alleles(snp) in _PALINDROMIC_PAIRS -def _match_interpretation(snp: SnpDef, genotype: str) -> tuple[str, str]: +def _classify_call_status(genotype: str | None) -> str: + if genotype is None: + return "not_tested" + genotype = genotype.upper() + if genotype in _NO_CALL_GENOTYPES: + return "no_call" + if not genotype or len(genotype) not in {1, 2}: + return "ambiguous" + if any(base not in "ACGT" for base in genotype): + return "ambiguous" + return "called" + + +def _confidence_label(score: float) -> str: + if score >= 0.9: + return "high" + if score >= 0.6: + return "medium" + if score > 0: + return "low" + return "none" + + +def _match_interpretation(snp: SnpDef, genotype: str) -> tuple[str, str, str, str | None]: """Match a genotype to a panel interpretation. Fallback chain: @@ -63,12 +96,12 @@ def _match_interpretation(snp: SnpDef, genotype: str) -> tuple[str, str]: # 1. exact if genotype in snp.interpretations: i = snp.interpretations[genotype] - return i.tier, i.note + return i.tier, i.note, "exact", genotype # 2. allele-order normalization norm = _normalize_genotype(genotype) for key, interp in snp.interpretations.items(): if _normalize_genotype(key) == norm: - return interp.tier, interp.note + return interp.tier, interp.note, "normalized", key # 3. reverse-complement (only for non-palindromic sites) if not _is_palindromic_site(snp): rc = _reverse_complement(genotype) @@ -76,8 +109,8 @@ def _match_interpretation(snp: SnpDef, genotype: str) -> tuple[str, str]: rc_norm = _normalize_genotype(rc) for key, interp in snp.interpretations.items(): if _normalize_genotype(key) == rc_norm: - return interp.tier, interp.note - return "unknown", f"Genotype {genotype} not interpreted in panel." + return interp.tier, interp.note, "reverse_complement", key + return "unknown", f"Genotype {genotype} not interpreted in panel.", "unmatched", None def analyze( @@ -92,29 +125,84 @@ def analyze( continue for snp in panel.snps: genotype = parsed.get(snp.rsid) - if genotype is None: + call_status = _classify_call_status(genotype) + if call_status == "not_tested": findings.append( Finding( panel_id=panel.id, rsid=snp.rsid, gene=snp.gene, genotype=None, + interpreted_genotype=None, tier="unknown", - note="SNP not present in raw DNA file.", + note="Marker not present in this source DNA file.", description=snp.description, + call_status="not_tested", + match_method="not_tested", + confidence_score=0.0, + confidence_label="none", ) ) continue - tier, note = _match_interpretation(snp, genotype) + if call_status == "no_call": + findings.append( + Finding( + panel_id=panel.id, + rsid=snp.rsid, + gene=snp.gene, + genotype=genotype, + interpreted_genotype=None, + tier="unknown", + note=( + "Marker is present in the file, but the source genotype " + "was a no-call." + ), + description=snp.description, + call_status="no_call", + match_method="no_call", + confidence_score=0.0, + confidence_label="none", + ) + ) + continue + if call_status == "ambiguous": + findings.append( + Finding( + panel_id=panel.id, + rsid=snp.rsid, + gene=snp.gene, + genotype=genotype, + interpreted_genotype=None, + tier="unknown", + note=( + "Marker is present, but the source genotype format is " + "ambiguous for this panel." + ), + description=snp.description, + call_status="ambiguous", + match_method="ambiguous", + confidence_score=_CONFIDENCE_SCORES["ambiguous"], + confidence_label=_confidence_label(_CONFIDENCE_SCORES["ambiguous"]), + ) + ) + continue + + tier, note, match_method, interpreted_genotype = _match_interpretation(snp, genotype) + confidence_score = _CONFIDENCE_SCORES[match_method] findings.append( Finding( panel_id=panel.id, rsid=snp.rsid, gene=snp.gene, genotype=genotype, + interpreted_genotype=interpreted_genotype, tier=tier, note=note, description=snp.description, + call_status="called", + match_method=match_method, + confidence_score=confidence_score, + confidence_label=_confidence_label(confidence_score), ) ) return findings diff --git a/src/opendna/annotations/clinvar.json b/src/opendna/annotations/clinvar.json index 3991053..ed7a62b 100644 --- a/src/opendna/annotations/clinvar.json +++ b/src/opendna/annotations/clinvar.json @@ -82,5 +82,17 @@ "condition": "Simvastatin-induced myopathy", "review_status": "reviewed by expert panel (PharmGKB)", "source": "ClinVar 2026-03 snapshot" + }, + "rs6025": { + "clinical_significance": "pathogenic / risk factor", + "condition": "Factor V Leiden thrombophilia; venous thromboembolism", + "review_status": "multiple submitters", + "source": "ClinVar 2026-04 snapshot" + }, + "rs1799963": { + "clinical_significance": "pathogenic (low penetrance)", + "condition": "Prothrombin-related thrombophilia; venous thromboembolism", + "review_status": "multiple submitters", + "source": "ClinVar 2026-04 snapshot" } } diff --git a/src/opendna/annotations/pharmgkb.json b/src/opendna/annotations/pharmgkb.json index 6d425a6..7358f61 100644 --- a/src/opendna/annotations/pharmgkb.json +++ b/src/opendna/annotations/pharmgkb.json @@ -106,6 +106,18 @@ } ] }, + "rs2108622": { + "gene": "CYP4F2", + "recommendations": [ + { + "drug": "warfarin", + "diplotype": "*3 carrier", + "recommendation": "Use a CYP2C9/VKORC1-guided dosing algorithm and recognize that CYP4F2 carriers can need a modestly higher dose.", + "evidence_level": "2A", + "source": "CPIC 2017" + } + ] + }, "rs4149056": { "gene": "SLCO1B1", "recommendations": [ diff --git a/src/opendna/cli.py b/src/opendna/cli.py index 9e336f1..b52c921 100644 --- a/src/opendna/cli.py +++ b/src/opendna/cli.py @@ -11,8 +11,9 @@ from opendna.annotations import annotate, load_clinvar, load_pharmgkb from opendna.annotations.updater import refresh from opendna.panels import load_panels -from opendna.parser import parse_23andme +from opendna.parser import parse_source_file from opendna.report import render_report +from opendna.summaries import build_analysis_summary def build_parser() -> argparse.ArgumentParser: @@ -56,12 +57,16 @@ def cmd_scan(args: argparse.Namespace) -> int: if not input_path.exists(): print(f"error: file not found: {input_path}", file=sys.stderr) return 1 - parsed = parse_23andme(input_path) + parsed = parse_source_file(input_path) panels = load_panels() selected = set(args.panels) if args.panels else None - findings = analyze(parsed, panels, selected_panel_ids=selected) + findings = analyze(parsed.genotypes, panels, selected_panel_ids=selected) findings = annotate(findings, load_clinvar(), load_pharmgkb()) - bundle = render_report(findings) + bundle = render_report( + findings, + source_file=parsed.source, + analysis_summary=build_analysis_summary(findings, panels), + ) html_out = input_path.with_name(input_path.stem + ".opendna.html") json_out = input_path.with_name(input_path.stem + ".opendna.json") diff --git a/src/opendna/llm/base.py b/src/opendna/llm/base.py index 61e6f07..fe3d2bd 100644 --- a/src/opendna/llm/base.py +++ b/src/opendna/llm/base.py @@ -18,7 +18,10 @@ def findings_to_prompt(findings: list[Finding]) -> str: """Render findings as a compact structured block for the LLM.""" lines = [] for f in findings: - line = f"[{f.tier}] {f.gene} {f.rsid} = {f.genotype or '--'} ({f.panel_id}) — {f.note}" + if f.call_status != "called": + continue + display_genotype = f.interpreted_genotype or f.genotype or "--" + line = f"[{f.tier}] {f.gene} {f.rsid} = {display_genotype} ({f.panel_id}) — {f.note}" if f.clinvar: line += f" | ClinVar: {f.clinvar['clinical_significance']}" if f.pharmgkb: diff --git a/src/opendna/models.py b/src/opendna/models.py index 71ff70d..1e74b52 100644 --- a/src/opendna/models.py +++ b/src/opendna/models.py @@ -6,6 +6,17 @@ from pydantic import BaseModel, Field Tier = Literal["normal", "warning", "risk", "unknown"] +CallStatus = Literal["called", "not_tested", "no_call", "ambiguous"] +MatchMethod = Literal[ + "exact", + "normalized", + "reverse_complement", + "unmatched", + "not_tested", + "no_call", + "ambiguous", +] +ConfidenceLabel = Literal["high", "medium", "low", "none"] class Interpretation(BaseModel): @@ -33,15 +44,90 @@ class Finding(BaseModel): rsid: str gene: str genotype: str | None + interpreted_genotype: str | None = None tier: Tier note: str description: str + call_status: CallStatus = "called" + match_method: MatchMethod = "exact" + confidence_score: float = 1.0 + confidence_label: ConfidenceLabel = "high" clinvar: dict | None = None pharmgkb: list[dict] | None = None +class ParseIssue(BaseModel): + severity: Literal["info", "warning"] + code: str + message: str + + +class SourceFileInfo(BaseModel): + path: str + vendor: str | None = None + build: str | None = None + unique_rsid_count: int + parsed_row_count: int + malformed_row_count: int = 0 + duplicate_rsid_count: int = 0 + no_call_count: int = 0 + ambiguous_call_count: int = 0 + comment_line_count: int = 0 + chromosome_labels: list[str] = Field(default_factory=list) + blind_spots: list[str] = Field(default_factory=list) + issues: list[ParseIssue] = Field(default_factory=list) + + +class ParseResult(BaseModel): + genotypes: dict[str, str] + source: SourceFileInfo + + +class PanelSummary(BaseModel): + panel_id: str + panel_name: str + total_snps: int + called_count: int + not_tested_count: int + no_call_count: int + ambiguous_count: int + risk_count: int + warning_count: int + normal_count: int + assessed_pct: int + + +class DerivedInsight(BaseModel): + id: str + title: str + summary: str + tier: Tier + confidence_score: float + confidence_label: ConfidenceLabel + genes: list[str] = Field(default_factory=list) + panel_ids: list[str] = Field(default_factory=list) + rsids: list[str] = Field(default_factory=list) + + +class GeneSummary(BaseModel): + gene: str + tier: Tier + summary: str + panels: list[str] = Field(default_factory=list) + rsids: list[str] = Field(default_factory=list) + caveat: str | None = None + + +class AnalysisSummary(BaseModel): + panel_summaries: list[PanelSummary] = Field(default_factory=list) + derived_insights: list[DerivedInsight] = Field(default_factory=list) + gene_summaries: list[GeneSummary] = Field(default_factory=list) + + class ReportBundle(BaseModel): findings: list[Finding] html: str json_payload: dict llm_prose: str | None = None + source_file: SourceFileInfo | None = None + analysis_summary: AnalysisSummary | None = None diff --git a/src/opendna/panels/cardiovascular.json b/src/opendna/panels/cardiovascular.json index d7cc605..c62f9dd 100644 --- a/src/opendna/panels/cardiovascular.json +++ b/src/opendna/panels/cardiovascular.json @@ -1,7 +1,7 @@ { "id": "cardiovascular", "name": "Cardiovascular & Longevity", - "description": "Variants associated with coronary artery disease risk and lifespan.", + "description": "Variants associated with coronary artery disease risk, thrombosis, and lifespan.", "snps": [ { "rsid": "rs429358", @@ -57,6 +57,28 @@ "CT": {"tier": "risk", "note": "Risk allele present — consider measuring Lp(a)."}, "CC": {"tier": "risk", "note": "Homozygous risk — strongly associated with elevated Lp(a)."} } + }, + { + "rsid": "rs6025", + "gene": "F5", + "variant_name": "Factor V Leiden", + "description": "Classic inherited thrombophilia variant; venous clot risk rises further with estrogen, surgery, pregnancy, or immobility.", + "interpretations": { + "CC": {"tier": "normal", "note": "No Factor V Leiden variant detected at this site."}, + "CT": {"tier": "warning", "note": "Factor V Leiden carrier — baseline venous clot risk is meaningfully elevated."}, + "TT": {"tier": "risk", "note": "Factor V Leiden homozygous — strongest common inherited clotting-risk pattern in this file type."} + } + }, + { + "rsid": "rs1799963", + "gene": "F2", + "variant_name": "Prothrombin G20210A", + "description": "Common inherited thrombophilia variant associated with higher prothrombin levels and venous thrombosis risk.", + "interpretations": { + "GG": {"tier": "normal", "note": "No prothrombin G20210A variant detected at this site."}, + "AG": {"tier": "warning", "note": "Carrier — venous thrombosis risk is elevated versus baseline."}, + "AA": {"tier": "risk", "note": "Homozygous carrier — higher clotting-risk signal than heterozygous carriage."} + } } ] } diff --git a/src/opendna/panels/pharmacogenomics.json b/src/opendna/panels/pharmacogenomics.json index f779edc..99028b0 100644 --- a/src/opendna/panels/pharmacogenomics.json +++ b/src/opendna/panels/pharmacogenomics.json @@ -58,6 +58,17 @@ "TT": {"tier": "risk", "note": "High sensitivity — lowest starting dose band."} } }, + { + "rsid": "rs2108622", + "gene": "CYP4F2", + "variant_name": "*3 (V433M)", + "description": "Modestly increases warfarin dose requirement; most useful when combined with CYP2C9 and VKORC1.", + "interpretations": { + "CC": {"tier": "normal", "note": "Baseline CYP4F2 contribution to warfarin dosing."}, + "CT": {"tier": "warning", "note": "Variant present — some patients need a modestly higher warfarin dose than CYP2C9/VKORC1 alone would predict."}, + "TT": {"tier": "warning", "note": "Strongest common CYP4F2 dose-increase signal; pair with CYP2C9 and VKORC1 rather than reading in isolation."} + } + }, { "rsid": "rs4149056", "gene": "SLCO1B1", diff --git a/src/opendna/parser.py b/src/opendna/parser.py index 7f2d522..e9f9867 100644 --- a/src/opendna/parser.py +++ b/src/opendna/parser.py @@ -3,24 +3,238 @@ from pathlib import Path +from opendna.models import ParseIssue, ParseResult, SourceFileInfo -def parse_23andme(path: Path | str) -> dict[str, str]: - """Parse a 23andMe-format TSV into {rsid: genotype}. +_VENDOR_PATTERNS = { + "23andMe": ("23andme", "23and me", "fileformat=23andme"), + "AncestryDNA": ("ancestrydna", "ancestry dna"), + "MyHeritage": ("myheritage",), + "FTDNA": ("ftdna", "family tree dna", "myftdna"), +} - Lines starting with '#' are treated as comments. Malformed rows are - silently skipped — upstream providers occasionally ship corrupt lines. - """ +_VENDOR_BLIND_SPOTS = { + "23andMe": [ + "Chip versions vary, so absent markers often reflect array design rather " + "than a true negative result.", + "This file is good for common SNPs but still misses rare variants, " + "structural variants, and most CYP2D6/HLA haplotypes.", + ], + "AncestryDNA": [ + "Absent pharmacogenomic markers often reflect chip design, not biology.", + "Common SNP coverage is useful, but rare pathogenic variants and " + "structural changes remain invisible.", + ], + "MyHeritage": [ + "Coverage is strongest for common SNPs; niche pharmacogenomic markers are patchier.", + "Rare variants, structural variants, and haplotype calls still require sequencing data.", + ], + "FTDNA": [ + "A missing rsid here should be read as vendor omission, not a negative genotype.", + "This file cannot resolve structural variants, copy-number changes, or " + "most star-allele haplotypes.", + ], +} + +_GENERIC_BLIND_SPOTS = [ + "Consumer array files only assay a tiny fraction of the genome, so normal " + "here never rules out other variants in the same gene.", + "This file type does not support rare-variant calling, structural variants, " + "methylation, HLA typing, or most CYP2D6 star-allele inference.", +] + +_NO_CALL_GENOTYPES = {"--", "-", "00", "NN", "NC"} + + +def _detect_vendor(comment_lines: list[str], file_name: str) -> str | None: + haystack = "\n".join(comment_lines + [file_name]).lower() + for vendor, patterns in _VENDOR_PATTERNS.items(): + if any(pattern in haystack for pattern in patterns): + return vendor + return None + + +def _detect_build(comment_lines: list[str]) -> str | None: + haystack = "\n".join(comment_lines).lower() + if "grch38" in haystack or "build 38" in haystack: + return "GRCh38" + if "grch37" in haystack or "build 37" in haystack: + return "GRCh37" + return None + + +def _classify_genotype(genotype: str) -> str: + if genotype in _NO_CALL_GENOTYPES: + return "no_call" + if not genotype: + return "ambiguous" + if any(base not in "ACGT" for base in genotype): + return "ambiguous" + if len(genotype) not in {1, 2}: + return "ambiguous" + return "called" + + +def _build_issues( + unique_rsid_count: int, + malformed_row_count: int, + duplicate_rsid_count: int, + no_call_count: int, + ambiguous_call_count: int, + vendor: str | None, + build: str | None, +) -> list[ParseIssue]: + issues: list[ParseIssue] = [] + if unique_rsid_count < 50_000: + issues.append( + ParseIssue( + severity="warning", + code="small-file", + message=( + "This raw DNA file is unusually small. Treat absent markers very cautiously; " + "the file may be truncated or heavily filtered." + ), + ) + ) + if malformed_row_count: + issues.append( + ParseIssue( + severity="warning", + code="malformed-rows", + message=f"Skipped {malformed_row_count} malformed rows while parsing the file.", + ) + ) + if duplicate_rsid_count: + issues.append( + ParseIssue( + severity="warning", + code="duplicate-rsids", + message=( + f"Found {duplicate_rsid_count} duplicate rsids. The last occurrence was kept " + "for analysis." + ), + ) + ) + if no_call_count: + issues.append( + ParseIssue( + severity="info", + code="no-calls", + message=( + f"{no_call_count} markers were present in the file but had no " + "confident genotype call." + ), + ) + ) + if ambiguous_call_count: + issues.append( + ParseIssue( + severity="warning", + code="ambiguous-calls", + message=( + f"{ambiguous_call_count} markers used a non-standard genotype format and were " + "not treated as high-confidence calls." + ), + ) + ) + if vendor is None: + issues.append( + ParseIssue( + severity="info", + code="vendor-unknown", + message="Vendor could not be identified heuristically from the file header.", + ) + ) + if build is None: + issues.append( + ParseIssue( + severity="info", + code="build-unknown", + message="Reference build was not detected from the file header.", + ) + ) + return issues + + +def parse_source_file(path: Path | str) -> ParseResult: + """Parse a consumer-DNA TSV into genotypes plus file-level metadata.""" path = Path(path) results: dict[str, str] = {} + comment_lines: list[str] = [] + chromosomes: set[str] = set() + parsed_row_count = 0 + malformed_row_count = 0 + duplicate_rsid_count = 0 + no_call_count = 0 + ambiguous_call_count = 0 + with path.open(encoding="utf-8-sig") as f: - for line in f: - line = line.strip() - if not line or line.startswith("#"): + for raw_line in f: + line = raw_line.strip() + if not line: + continue + if line.startswith("#"): + comment_lines.append(line) continue + parts = line.split("\t") if len(parts) < 4: + malformed_row_count += 1 + continue + + rsid, chrom, _pos, genotype = parts[:4] + if not rsid.startswith("rs"): + malformed_row_count += 1 continue - rsid, _chrom, _pos, genotype = parts[:4] - if rsid.startswith("rs"): - results[rsid] = genotype - return results + + parsed_row_count += 1 + chromosomes.add(chrom) + genotype = genotype.strip().upper() + if rsid in results: + duplicate_rsid_count += 1 + results[rsid] = genotype + + status = _classify_genotype(genotype) + if status == "no_call": + no_call_count += 1 + elif status == "ambiguous": + ambiguous_call_count += 1 + + vendor = _detect_vendor(comment_lines, path.name) + build = _detect_build(comment_lines) + issues = _build_issues( + unique_rsid_count=len(results), + malformed_row_count=malformed_row_count, + duplicate_rsid_count=duplicate_rsid_count, + no_call_count=no_call_count, + ambiguous_call_count=ambiguous_call_count, + vendor=vendor, + build=build, + ) + blind_spots = _GENERIC_BLIND_SPOTS + _VENDOR_BLIND_SPOTS.get(vendor, []) + return ParseResult( + genotypes=results, + source=SourceFileInfo( + path=str(path), + vendor=vendor, + build=build, + unique_rsid_count=len(results), + parsed_row_count=parsed_row_count, + malformed_row_count=malformed_row_count, + duplicate_rsid_count=duplicate_rsid_count, + no_call_count=no_call_count, + ambiguous_call_count=ambiguous_call_count, + comment_line_count=len(comment_lines), + chromosome_labels=sorted(chromosomes), + blind_spots=blind_spots, + issues=issues, + ), + ) + + +def parse_23andme(path: Path | str) -> dict[str, str]: + """Parse a 23andMe-format TSV into {rsid: genotype}. + + Lines starting with '#' are treated as comments. Malformed rows are + silently skipped — upstream providers occasionally ship corrupt lines. + """ + return parse_source_file(path).genotypes diff --git a/src/opendna/report/render.py b/src/opendna/report/render.py index bc1549e..59ebc52 100644 --- a/src/opendna/report/render.py +++ b/src/opendna/report/render.py @@ -7,8 +7,9 @@ from jinja2 import Environment, FileSystemLoader, select_autoescape -from opendna.models import Finding, ReportBundle +from opendna.models import AnalysisSummary, Finding, ReportBundle, SourceFileInfo from opendna.panels import load_panels +from opendna.summaries import build_analysis_summary, gene_caveats def _group_by_panel(findings: list[Finding]) -> dict[str, list[Finding]]: @@ -32,6 +33,8 @@ def _panel_names() -> dict[str, str]: def render_report( findings: list[Finding], llm_prose: str | None = None, + source_file: SourceFileInfo | None = None, + analysis_summary: AnalysisSummary | None = None, ) -> ReportBundle: template_dir = files("opendna.report") env = Environment( @@ -43,10 +46,11 @@ def render_report( grouped = _group_by_panel(findings) counts = _counts_by_tier(findings) generated_at = datetime.now(UTC).isoformat(timespec="seconds") - findings_visible = sum(1 for f in findings if f.genotype is not None) + findings_visible = sum(1 for f in findings if f.call_status == "called") + analysis_summary = analysis_summary or build_analysis_summary(findings, load_panels()) critical_findings = [ f for f in findings - if f.tier == "risk" and f.genotype is not None + if f.tier == "risk" and f.call_status == "called" ][:5] html = template.render( @@ -58,6 +62,12 @@ def render_report( findings_visible=findings_visible, critical_findings=critical_findings, panel_names=_panel_names(), + source_file=source_file, + analysis_summary=analysis_summary, + panel_summary_map={ + summary.panel_id: summary for summary in analysis_summary.panel_summaries + }, + gene_caveats=gene_caveats(), ) json_payload = { "generated_at": generated_at, @@ -65,10 +75,14 @@ def render_report( "counts_by_tier": counts, "findings": [f.model_dump() for f in findings], "llm_prose": llm_prose, + "source_file": source_file.model_dump() if source_file else None, + "analysis_summary": analysis_summary.model_dump(), } return ReportBundle( findings=findings, html=html, json_payload=json_payload, llm_prose=llm_prose, + source_file=source_file, + analysis_summary=analysis_summary, ) diff --git a/src/opendna/report/template.html.j2 b/src/opendna/report/template.html.j2 index d5d41f8..8abead2 100644 --- a/src/opendna/report/template.html.j2 +++ b/src/opendna/report/template.html.j2 @@ -7,21 +7,22 @@ @@ -98,32 +131,163 @@
Generated
{{ generated_at.split("T")[0] }}
-
{{ findings_total }} SNPs scanned · {{ findings_visible }} with data
+
{{ findings_total }} target SNPs · {{ findings_visible }} confidently assessed
-
{{ counts.risk }}
Risk
-
{{ counts.warning }}
Warnings
-
{{ counts.normal }}
Normal
-
{{ counts.unknown }}
Not in file
+
+
{{ counts.risk }}
+
Risk
+
+
+
{{ counts.warning }}
+
Warnings
+
+
+
{{ counts.normal }}
+
Normal
+
+
+
{{ counts.unknown }}
+
Unresolved
+
not on chip, no-call, ambiguous, or not interpreted
+
+ {% if source_file %} +
+

Source File Quality

+

These checks describe what this raw DNA file can and cannot support. Missing markers should be read as “not assessed,” not as a negative result.

+
+
+
Vendor
+

{{ source_file.vendor or "Unknown" }}

+
+
+
Reference Build
+

{{ source_file.build or "Unknown" }}

+
+
+
Unique Rsids
+

{{ source_file.unique_rsid_count }}

+
+
+
No-Calls / Ambiguous
+

{{ source_file.no_call_count }} / {{ source_file.ambiguous_call_count }}

+
+
+ {% if source_file.issues %} +
+
QC notes
+
    + {% for issue in source_file.issues %} +
  • {{ issue.severity|title }}: {{ issue.message }}
  • + {% endfor %} +
+
+ {% endif %} + {% if source_file.blind_spots %} +
+
Known blind spots for this file type
+
    + {% for note in source_file.blind_spots %} +
  • {{ note }}
  • + {% endfor %} +
+
+ {% endif %} +
+ {% endif %} + {% if critical_findings %}
-

⚠ Critical findings

+

Critical Findings

{% for f in critical_findings %}
- {{ f.gene }} ({{ f.rsid }} {{ f.genotype }}): + {{ f.gene }} ({{ f.rsid }} {{ f.interpreted_genotype or f.genotype }}): {{ f.note }} {% if f.pharmgkb %} - Drug considerations: {% for r in f.pharmgkb %}{{ r.drug }}{% if not loop.last %}, {% endif %}{% endfor %}. + Drug considerations: + {% for r in f.pharmgkb %} + {{ r.drug }}{% if not loop.last %}, {% endif %} + {% endfor %}. {% endif %}
{% endfor %}
{% endif %} + {% if analysis_summary.derived_insights %} +
+

Composite Calls

+

These combine multiple source-file markers into a single interpretation, which is usually more useful than reading each rsid in isolation.

+
+ {% for insight in analysis_summary.derived_insights %} +
+
+

{{ insight.title }}

+ {{ insight.tier }} +
+

{{ insight.summary }}

+
+
Genes: {{ insight.genes | join(", ") }}
+
Inputs: {{ insight.rsids | join(", ") }}
+
Confidence: {{ (insight.confidence_score * 100) | round | int }}% · {{ insight.confidence_label }}
+
+
+ {% endfor %} +
+
+ {% endif %} + + {% if analysis_summary.panel_summaries %} +
+

Panel Coverage

+

A section is only as good as the markers this file actually covered. These counts separate assessed markers from missing or unusable ones.

+
+ {% for panel in analysis_summary.panel_summaries %} +
+
{{ panel.panel_name }}
+

{{ panel.called_count }}/{{ panel.total_snps }}

+
assessed · {{ panel.assessed_pct }}%
+
+
Not on chip: {{ panel.not_tested_count }}
+
No-call: {{ panel.no_call_count }}
+
Ambiguous: {{ panel.ambiguous_count }}
+
Risk / Warning / Normal: {{ panel.risk_count }} / {{ panel.warning_count }} / {{ panel.normal_count }}
+
+
+ {% endfor %} +
+
+ {% endif %} + + {% if analysis_summary.gene_summaries %} +
+

Cross-Panel Gene View

+

These roll up duplicate or multi-marker genes so the analysis reads at the gene level, not just the panel level.

+
+ {% for gene in analysis_summary.gene_summaries %} +
+
+

{{ gene.gene }}

+ {{ gene.tier }} +
+

{{ gene.summary }}

+
+
Panels: {{ gene.panels | join(", ") }}
+
Rsids: {{ gene.rsids | join(", ") }}
+ {% if gene.caveat %} +
Boundary: {{ gene.caveat }}
+ {% endif %} +
+
+ {% endfor %} +
+
+ {% endif %} + {% if llm_prose %}

AI Synthesis

@@ -132,31 +296,49 @@ {% endif %} {% for panel_id, items in grouped.items() %} - {% set visible = items | selectattr("genotype") | list %} + {% set visible = items | selectattr("call_status", "equalto", "called") | list %} + {% set panel_summary = panel_summary_map.get(panel_id) %} {% if visible %}

{{ panel_names.get(panel_id, panel_id|replace('_', ' ')|title) }}

+ {% if panel_summary %} +

+ {{ panel_summary.called_count }}/{{ panel_summary.total_snps }} assessed · + {{ panel_summary.not_tested_count }} not on chip · + {{ panel_summary.no_call_count }} no-call · + {{ panel_summary.ambiguous_count }} ambiguous +

+ {% endif %}
{% for f in visible %}
-
+
{{ f.gene }}
{{ f.tier }}
-
{{ f.genotype }} · {{ f.rsid }}
+
+ {{ f.interpreted_genotype or f.genotype }} · {{ f.rsid }} + {% if f.interpreted_genotype and f.genotype and f.interpreted_genotype != f.genotype %} + (raw {{ f.genotype }}) + {% endif %} +
{{ f.note }}
- {% if f.clinvar or f.pharmgkb %} -
+
+
Confidence: {{ (f.confidence_score * 100) | round | int }}% · {{ f.confidence_label }} via {{ f.match_method|replace("_", " ") }}
+ {% if gene_caveats.get(f.gene) %} +
Boundary: {{ gene_caveats.get(f.gene) }}
+ {% endif %} {% if f.clinvar %} -
ClinVar: {{ f.clinvar.clinical_significance }} — {{ f.clinvar.condition }}
+
ClinVar: {{ f.clinvar.clinical_significance }} — {{ f.clinvar.condition }}
+
ClinVar review: {{ f.clinvar.review_status }}{% if f.clinvar.source %} · {{ f.clinvar.source }}{% endif %}
{% endif %} {% if f.pharmgkb %} {% for rec in f.pharmgkb %} -
{{ rec.drug|title }}: {{ rec.recommendation }}
+
{{ rec.drug|title }}: {{ rec.recommendation }}
+
PGx evidence: {{ rec.evidence_level }}{% if rec.source %} · {{ rec.source }}{% endif %}
{% endfor %} {% endif %}
- {% endif %}
{% endfor %}
@@ -166,7 +348,7 @@
Generated by OpenDNA - · ClinVar (NIH, public domain) · PharmGKB under CC BY-SA 4.0 + · ClinVar (NIH, public domain) · PharmGKB / CPIC under CC BY-SA 4.0
diff --git a/src/opendna/server.py b/src/opendna/server.py index 50626f5..2ef24bc 100644 --- a/src/opendna/server.py +++ b/src/opendna/server.py @@ -21,8 +21,9 @@ from opendna.annotations.updater import refresh from opendna.llm import get_provider from opendna.panels import load_panels -from opendna.parser import parse_23andme +from opendna.parser import parse_source_file from opendna.report import render_report +from opendna.summaries import build_analysis_summary logger = logging.getLogger("opendna.server") @@ -67,14 +68,15 @@ def analyze_endpoint(req: AnalyzeRequest) -> JSONResponse: raise HTTPException(status_code=400, detail=f"File not found: {req.file_path}") try: - parsed = parse_23andme(path) + parsed = parse_source_file(path) except Exception as exc: raise HTTPException(status_code=400, detail=f"Parse error: {exc}") from exc panels = load_panels() selected = set(req.selected_panels) if req.selected_panels else None - findings = analyze(parsed, panels, selected_panel_ids=selected) + findings = analyze(parsed.genotypes, panels, selected_panel_ids=selected) findings = annotate(findings, load_clinvar(), load_pharmgkb()) + summary = build_analysis_summary(findings, panels) prose: str | None = None if req.llm is not None: @@ -86,7 +88,12 @@ def analyze_endpoint(req: AnalyzeRequest) -> JSONResponse: logger.exception("LLM synthesis failed") raise HTTPException(status_code=502, detail=f"LLM provider error: {exc}") from exc - bundle = render_report(findings, llm_prose=prose) + bundle = render_report( + findings, + llm_prose=prose, + source_file=parsed.source, + analysis_summary=summary, + ) return JSONResponse({ "report_html": bundle.html, "report_json": bundle.json_payload, @@ -107,13 +114,21 @@ def _analyze_stream_generator(req: AnalyzeRequest) -> Iterator[str]: yield _sse_event("progress", stage="parse", message="Parsing DNA file", pct=12) try: - parsed = parse_23andme(path) + parsed = parse_source_file(path) except Exception as exc: yield _sse_event("error", detail=f"Parse error: {exc}") return yield _sse_event( "progress", stage="parse", - message=f"Parsed {len(parsed):,} SNPs from file", pct=30, + message=( + f"Parsed {parsed.source.unique_rsid_count:,} SNPs" + + ( + f" ({parsed.source.vendor}, {parsed.source.build or 'build unknown'})" + if parsed.source.vendor + else "" + ) + ), + pct=30, ) yield _sse_event( @@ -122,7 +137,7 @@ def _analyze_stream_generator(req: AnalyzeRequest) -> Iterator[str]: ) panels = load_panels() selected = set(req.selected_panels) if req.selected_panels else None - findings = analyze(parsed, panels, selected_panel_ids=selected) + findings = analyze(parsed.genotypes, panels, selected_panel_ids=selected) matched = sum(1 for f in findings if f.genotype is not None) yield _sse_event( "progress", stage="analyze", @@ -135,6 +150,7 @@ def _analyze_stream_generator(req: AnalyzeRequest) -> Iterator[str]: message="Joining ClinVar + PharmGKB annotations", pct=62, ) findings = annotate(findings, load_clinvar(), load_pharmgkb()) + summary = build_analysis_summary(findings, panels) prose: str | None = None if req.llm is not None: @@ -157,7 +173,12 @@ def _analyze_stream_generator(req: AnalyzeRequest) -> Iterator[str]: return yield _sse_event("progress", stage="render", message="Rendering report", pct=95) - bundle = render_report(findings, llm_prose=prose) + bundle = render_report( + findings, + llm_prose=prose, + source_file=parsed.source, + analysis_summary=summary, + ) yield _sse_event( "complete", pct=100, report_html=bundle.html, diff --git a/src/opendna/summaries.py b/src/opendna/summaries.py new file mode 100644 index 0000000..58eabe7 --- /dev/null +++ b/src/opendna/summaries.py @@ -0,0 +1,517 @@ +"""Higher-level summaries layered on top of per-rsid findings.""" +from __future__ import annotations + +from collections import defaultdict + +from opendna.models import ( + AnalysisSummary, + DerivedInsight, + Finding, + GeneSummary, + Panel, + PanelSummary, +) + +_TIER_RANK = {"risk": 3, "warning": 2, "normal": 1, "unknown": 0} + +GENE_CAVEATS = { + "APOE": ( + "APOE status here comes from rs429358 + rs7412 only; it does not capture " + "non-APOE causes of dementia or lipid risk." + ), + "CYP2C19": ( + "This captures the common *2 and *17 markers only; other loss-of-function " + "alleles are not assessed in most consumer array files." + ), + "CYP2C9": ( + "This covers the common *2 and *3 markers only; other CYP2C9 star alleles " + "remain untested." + ), + "CYP3A5": ( + "rs776746 captures the common expressor split, not the full CYP3A5 " + "star-allele space." + ), + "CYP4F2": ( + "CYP4F2 contributes modestly to warfarin dose and should be read " + "alongside CYP2C9 and VKORC1." + ), + "F2": ( + "This common thrombophilia marker raises baseline clot risk but does not " + "diagnose an active clotting disorder." + ), + "F5": ( + "This array marker flags inherited thrombophilia risk, not current clotting " + "status or treatment need." + ), + "HFE": ( + "These common HFE variants explain many classic hemochromatosis patterns, " + "but not every cause of iron overload or anemia." + ), + "MTHFR": ( + "Common MTHFR SNPs do not directly diagnose folate deficiency; " + "homocysteine, B12, folate, and diet still matter." + ), + "SLCO1B1": ( + "rs4149056 captures the best-known simvastatin myopathy signal, not the " + "full statin-response landscape." + ), + "TPMT": ( + "This panel captures TPMT *3C only; other TPMT and NUDT15 variants can " + "still be clinically important." + ), + "VKORC1": ( + "Warfarin response still depends on age, diet, liver function, " + "interacting drugs, and other genes." + ), +} + + +def gene_caveats() -> dict[str, str]: + return GENE_CAVEATS.copy() + + +def _top_tier(*tiers: str) -> str: + return max(tiers, key=lambda tier: _TIER_RANK[tier], default="unknown") + + +def _confidence_label(score: float) -> str: + if score >= 0.9: + return "high" + if score >= 0.6: + return "medium" + if score > 0: + return "low" + return "none" + + +def _canonical_genotype(finding: Finding | None) -> str | None: + if finding is None or finding.call_status != "called": + return None + return finding.interpreted_genotype or finding.genotype + + +def _variant_count(finding: Finding | None, allele: str) -> int | None: + genotype = _canonical_genotype(finding) + if genotype is None: + return None + return genotype.count(allele) + + +def _unique_sorted(values: list[str]) -> list[str]: + return sorted({value for value in values if value}) + + +def _unique_notes(findings: list[Finding]) -> list[str]: + seen: list[str] = [] + for finding in findings: + if finding.note not in seen: + seen.append(finding.note) + return seen + + +def _present_findings(*findings: Finding | None) -> list[Finding]: + return [finding for finding in findings if finding is not None] + + +def _make_insight( + insight_id: str, + title: str, + summary: str, + tier: str, + findings: list[Finding], +) -> DerivedInsight: + confidence = min((finding.confidence_score for finding in findings), default=0.0) + return DerivedInsight( + id=insight_id, + title=title, + summary=summary, + tier=tier, + confidence_score=confidence, + confidence_label=_confidence_label(confidence), + genes=_unique_sorted([finding.gene for finding in findings]), + panel_ids=_unique_sorted([finding.panel_id for finding in findings]), + rsids=_unique_sorted([finding.rsid for finding in findings]), + ) + + +def _apoe_insight(rsid_map: dict[str, Finding]) -> DerivedInsight | None: + rs429358 = rsid_map.get("rs429358") + rs7412 = rsid_map.get("rs7412") + g1 = _canonical_genotype(rs429358) + g2 = _canonical_genotype(rs7412) + if g1 is None or g2 is None: + return None + + mapping: dict[tuple[str, str], tuple[str, str, str]] = { + ("TT", "TT"): ( + "normal", + "APOE composite: e2/e2.", + "Often associated with lower LDL, but homozygous e2 can predispose to " + "type III hyperlipoproteinemia in the right metabolic context.", + ), + ("TT", "CT"): ( + "normal", + "APOE composite: e2/e3.", + "This usually behaves as a lower-risk APOE pattern than e3/e4 or e4/e4.", + ), + ("TT", "CC"): ( + "normal", + "APOE composite: e3/e3.", + "This is the baseline APOE pattern used for most population comparisons.", + ), + ("CT", "CT"): ( + "warning", + "APOE composite: e2/e4.", + "This is a mixed pattern: one e4 allele raises neuro/cardiometabolic " + "risk, while one e2 allele can shift lipid behavior.", + ), + ("CT", "CC"): ( + "warning", + "APOE composite: e3/e4.", + "One e4 allele is present, which is associated with higher Alzheimer " + "and cardiovascular risk than e3/e3.", + ), + ("CC", "CC"): ( + "risk", + "APOE composite: e4/e4.", + "Two e4 alleles are present, which is the highest-risk common APOE " + "pattern in this assay.", + ), + } + resolved = mapping.get((g1, g2)) + if resolved is None: + return None + + tier, headline, detail = resolved + return _make_insight( + "apoe", + "APOE Composite", + f"{headline} {detail}", + tier, + _present_findings(rs429358, rs7412), + ) + + +def _hfe_insight(rsid_map: dict[str, Finding]) -> DerivedInsight | None: + c282y = rsid_map.get("rs1800562") + h63d = rsid_map.get("rs1799945") + s65c = rsid_map.get("rs1800730") + c282y_count = _variant_count(c282y, "A") + h63d_count = _variant_count(h63d, "G") + s65c_count = _variant_count(s65c, "T") + counts = [count for count in (c282y_count, h63d_count, s65c_count) if count is not None] + if not counts: + return None + + title = "HFE Composite" + findings = _present_findings(c282y, h63d, s65c) + if c282y_count == 2: + return _make_insight( + "hfe", + title, + "HFE composite: C282Y homozygous. This is the classic highest-risk " + "common hemochromatosis pattern in consumer-array data.", + "risk", + findings, + ) + if c282y_count == 1 and (h63d_count or 0) >= 1: + return _make_insight( + "hfe", + title, + "HFE composite: likely C282Y/H63D compound-carrier pattern. This " + "meaningfully raises iron-overload risk versus either variant alone.", + "risk", + findings, + ) + if c282y_count == 1 and (s65c_count or 0) >= 1: + return _make_insight( + "hfe", + title, + "HFE composite: C282Y with S65C. This is weaker than C282Y/H63D but " + "still worth pairing with ferritin and transferrin saturation.", + "warning", + findings, + ) + if (h63d_count or 0) == 2: + return _make_insight( + "hfe", + title, + "HFE composite: H63D homozygous. Penetrance is lower than C282Y, but " + "iron studies are still worth checking if symptoms or family history fit.", + "warning", + findings, + ) + if any((count or 0) >= 1 for count in (c282y_count, h63d_count, s65c_count)): + return _make_insight( + "hfe", + title, + "HFE composite: carrier signal present, but not the highest-risk " + "common genotype combination.", + "warning", + findings, + ) + return _make_insight( + "hfe", + title, + "HFE composite: no common high-risk HFE pattern detected in the markers " + "this file covers.", + "normal", + findings, + ) + + +def _cyp2c19_insight(rsid_map: dict[str, Finding]) -> DerivedInsight | None: + loss = rsid_map.get("rs4244285") + gain = rsid_map.get("rs12248560") + loss_count = _variant_count(loss, "A") + gain_count = _variant_count(gain, "T") + if loss_count is None or gain_count is None: + return None + + if loss_count == 0 and gain_count == 0: + tier = "normal" + phenotype = "likely normal metabolizer" + elif loss_count == 0 and gain_count == 1: + tier = "warning" + phenotype = "likely rapid metabolizer" + elif loss_count == 0 and gain_count == 2: + tier = "warning" + phenotype = "likely ultra-rapid metabolizer" + elif loss_count == 1 and gain_count == 0: + tier = "warning" + phenotype = "likely intermediate metabolizer" + elif loss_count == 2 and gain_count == 0: + tier = "risk" + phenotype = "likely poor metabolizer" + elif loss_count == 1 and gain_count == 1: + tier = "warning" + phenotype = "most consistent with a *2/*17 intermediate-metabolizer pattern" + else: + return None + + summary = ( + f"CYP2C19 composite phenotype: {phenotype}. " + "This is more informative for clopidogrel and SSRI interpretation than " + "reading *2 and *17 independently." + ) + return _make_insight( + "cyp2c19", + "CYP2C19 Composite", + summary, + tier, + _present_findings(loss, gain), + ) + + +def _warfarin_insight(rsid_map: dict[str, Finding]) -> DerivedInsight | None: + cyp2c9_2 = rsid_map.get("rs1799853") + cyp2c9_3 = rsid_map.get("rs1057910") + vkorc1 = rsid_map.get("rs9923231") + cyp4f2 = rsid_map.get("rs2108622") + + reduced_2 = _variant_count(cyp2c9_2, "T") + reduced_3 = _variant_count(cyp2c9_3, "C") + sensitive = _variant_count(vkorc1, "T") + offset = _variant_count(cyp4f2, "T") + if reduced_2 is None or reduced_3 is None or sensitive is None: + return None + + reduced_total = reduced_2 + reduced_3 + if reduced_total >= 2 and sensitive >= 1: + tier = "risk" + detail = ( + "Warfarin composite: strong sensitivity signal. PGx-guided dosing is " + "strongly indicated." + ) + elif reduced_total >= 1 and sensitive >= 1: + tier = "risk" + detail = ( + "Warfarin composite: both metabolism and target sensitivity markers " + "point toward a lower starting dose." + ) + elif reduced_total >= 2 or sensitive == 2: + tier = "risk" + detail = "Warfarin composite: one major sensitivity pathway is strongly shifted." + elif reduced_total >= 1 or sensitive >= 1: + tier = "warning" + detail = "Warfarin composite: a moderate dose-reduction signal is present." + elif offset and offset >= 1: + tier = "warning" + detail = "Warfarin composite: CYP4F2 suggests a modestly higher dose may be required." + else: + tier = "normal" + detail = ( + "Warfarin composite: no common low-dose pattern was detected in the " + "markers this file covers." + ) + + if offset and offset >= 1 and tier in {"warning", "risk"}: + detail += ( + " CYP4F2 may modestly offset some of that sensitivity by nudging dose " + "requirements upward." + ) + + return _make_insight( + "warfarin", + "Warfarin Composite", + detail, + tier, + _present_findings(cyp2c9_2, cyp2c9_3, vkorc1, cyp4f2), + ) + + +def _mthfr_insight(rsid_map: dict[str, Finding]) -> DerivedInsight | None: + c677t = rsid_map.get("rs1801133") + a1298c = rsid_map.get("rs1801131") + c677t_count = _variant_count(c677t, "T") + a1298c_count = _variant_count(a1298c, "C") + if c677t_count is None or a1298c_count is None: + return None + + if c677t_count == 2: + tier = "risk" + detail = ( + "MTHFR composite: C677T homozygous. This is the strongest common " + "MTHFR signal in consumer arrays." + ) + elif c677t_count == 1 and a1298c_count == 1: + tier = "warning" + detail = ( + "MTHFR composite: likely compound heterozygous C677T/A1298C pattern, " + "which can matter more than either single heterozygous call alone." + ) + elif c677t_count == 1 or a1298c_count >= 1: + tier = "warning" + detail = "MTHFR composite: one reduced-function common variant is present." + else: + tier = "normal" + detail = ( + "MTHFR composite: no common reduced-function pattern detected in the " + "markers this file covers." + ) + + return _make_insight( + "mthfr", + "MTHFR Composite", + detail, + tier, + _present_findings(c677t, a1298c), + ) + + +def _build_derived_insights(findings: list[Finding]) -> list[DerivedInsight]: + rsid_map = { + finding.rsid: finding + for finding in findings + if finding.call_status in {"called", "no_call", "ambiguous"} + } + builders = ( + _apoe_insight, + _hfe_insight, + _cyp2c19_insight, + _warfarin_insight, + _mthfr_insight, + ) + insights = [insight for builder in builders if (insight := builder(rsid_map)) is not None] + insights.sort(key=lambda item: (-_TIER_RANK[item.tier], item.title)) + return insights + + +def _build_panel_summaries(findings: list[Finding], panels: list[Panel]) -> list[PanelSummary]: + findings_by_panel: dict[str, list[Finding]] = defaultdict(list) + for finding in findings: + findings_by_panel[finding.panel_id].append(finding) + + summaries: list[PanelSummary] = [] + for panel in panels: + if panel.id not in findings_by_panel: + continue + + items = findings_by_panel[panel.id] + total = len(items) + called_count = sum(item.call_status == "called" for item in items) + not_tested_count = sum(item.call_status == "not_tested" for item in items) + no_call_count = sum(item.call_status == "no_call" for item in items) + ambiguous_count = sum(item.call_status == "ambiguous" for item in items) + risk_count = sum( + item.tier == "risk" and item.call_status == "called" for item in items + ) + warning_count = sum( + item.tier == "warning" and item.call_status == "called" for item in items + ) + normal_count = sum( + item.tier == "normal" and item.call_status == "called" for item in items + ) + + summaries.append( + PanelSummary( + panel_id=panel.id, + panel_name=panel.name, + total_snps=total, + called_count=called_count, + not_tested_count=not_tested_count, + no_call_count=no_call_count, + ambiguous_count=ambiguous_count, + risk_count=risk_count, + warning_count=warning_count, + normal_count=normal_count, + assessed_pct=int(round((called_count / total) * 100)) if total else 0, + ) + ) + return summaries + + +def _build_gene_summaries( + findings: list[Finding], + derived_insights: list[DerivedInsight], + panels_by_id: dict[str, str], +) -> list[GeneSummary]: + derived_by_gene: dict[str, list[DerivedInsight]] = defaultdict(list) + for insight in derived_insights: + for gene in insight.genes: + derived_by_gene[gene].append(insight) + + groups: dict[str, list[Finding]] = defaultdict(list) + for finding in findings: + if finding.call_status == "called": + groups[finding.gene].append(finding) + + summaries: list[GeneSummary] = [] + for gene, items in groups.items(): + insights = derived_by_gene.get(gene, []) + multi_signal = ( + len(items) > 1 + or len({item.panel_id for item in items}) > 1 + or bool(insights) + ) + if not multi_signal: + continue + + top_tier = _top_tier( + *[item.tier for item in items], + *[insight.tier for insight in insights], + ) + summary = insights[0].summary if insights else " ".join(_unique_notes(items)[:2]) + summaries.append( + GeneSummary( + gene=gene, + tier=top_tier, + summary=summary, + panels=_unique_sorted([panels_by_id[item.panel_id] for item in items]), + rsids=_unique_sorted([item.rsid for item in items]), + caveat=GENE_CAVEATS.get(gene), + ) + ) + + summaries.sort(key=lambda item: (-_TIER_RANK[item.tier], item.gene)) + return summaries + + +def build_analysis_summary(findings: list[Finding], panels: list[Panel]) -> AnalysisSummary: + panels_by_id = {panel.id: panel.name for panel in panels} + derived_insights = _build_derived_insights(findings) + return AnalysisSummary( + panel_summaries=_build_panel_summaries(findings, panels), + derived_insights=derived_insights, + gene_summaries=_build_gene_summaries(findings, derived_insights, panels_by_id), + ) diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py index e46fa9c..b532530 100644 --- a/tests/test_analyzer.py +++ b/tests/test_analyzer.py @@ -54,10 +54,14 @@ def test_analyze_handles_reverse_complement_genotype() -> None: hit = next(f for f in findings if f.rsid == "rs1801133") assert hit.tier == "normal" # GG → RC → CC → normal assert hit.genotype == "GG" # preserve the raw genotype in the finding + assert hit.interpreted_genotype == "CC" + assert hit.match_method == "reverse_complement" + assert hit.confidence_label == "high" findings = analyze({"rs1801133": "AA"}, panels) hit = next(f for f in findings if f.rsid == "rs1801133") assert hit.tier == "risk" # AA → RC → TT → risk + assert hit.interpreted_genotype == "TT" def test_analyze_skips_reverse_complement_for_palindromic_sites() -> None: @@ -88,3 +92,13 @@ def test_analyze_skips_reverse_complement_for_palindromic_sites() -> None: findings = analyze({"rs9000001": "CC"}, [palindromic_panel]) assert len(findings) == 1 assert findings[0].tier == "unknown" + + +def test_analyze_marks_no_calls_explicitly() -> None: + panels = [p for p in load_panels() if p.id == "methylation"] + findings = analyze({"rs1801133": "--"}, panels) + hit = next(f for f in findings if f.rsid == "rs1801133") + assert hit.call_status == "no_call" + assert hit.match_method == "no_call" + assert hit.confidence_score == 0 + assert hit.tier == "unknown" diff --git a/tests/test_cli.py b/tests/test_cli.py index 0fe20d8..d4c19b9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -26,6 +26,8 @@ def test_cmd_scan_writes_html_and_json_alongside_input(tmp_path: Path) -> None: assert html.exists() and js.exists() payload = json.loads(js.read_text()) assert payload["findings_count"] >= 2 + assert payload["source_file"]["unique_rsid_count"] == 2 + assert "analysis_summary" in payload def test_cmd_update_db_prints_status(capsys) -> None: diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 8625e97..c946897 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -19,6 +19,8 @@ def test_full_pipeline_rule_based(fixtures_dir: Path) -> None: body = resp.json() assert body["report_html"].startswith("") payload = body["report_json"] + assert payload["source_file"]["vendor"] == "23andMe" + assert "analysis_summary" in payload rsids = {f["rsid"] for f in payload["findings"] if f["genotype"]} assert "rs1801133" in rsids @@ -30,6 +32,10 @@ def test_full_pipeline_rule_based(fixtures_dir: Path) -> None: assert tpmt["tier"] == "risk" assert tpmt["pharmgkb"] is not None + derived_ids = {item["id"] for item in payload["analysis_summary"]["derived_insights"]} + assert "mthfr" in derived_ids + assert "apoe" in derived_ids + def test_full_pipeline_with_mocked_llm(fixtures_dir: Path) -> None: client = TestClient(app) diff --git a/tests/test_panels.py b/tests/test_panels.py index e12fc99..4b3ca17 100644 --- a/tests/test_panels.py +++ b/tests/test_panels.py @@ -36,6 +36,15 @@ def test_methylation_panel_has_known_snps() -> None: assert "rs4680" in rsids # COMT +def test_expanded_panels_include_new_high_signal_markers() -> None: + panels = {p.id: p for p in load_panels()} + cardio_rsids = {s.rsid for s in panels["cardiovascular"].snps} + pgx_rsids = {s.rsid for s in panels["pharmacogenomics"].snps} + assert "rs6025" in cardio_rsids # F5 Factor V Leiden + assert "rs1799963" in cardio_rsids # F2 prothrombin G20210A + assert "rs2108622" in pgx_rsids # CYP4F2 warfarin modifier + + def test_snp_interpretations_are_indexed_by_genotype() -> None: panels = {p.id: p for p in load_panels()} mthfr = next(s for s in panels["methylation"].snps if s.rsid == "rs1801133") diff --git a/tests/test_parser.py b/tests/test_parser.py index 6203b87..884e2e8 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,6 +1,6 @@ from pathlib import Path -from opendna.parser import parse_23andme +from opendna.parser import parse_23andme, parse_source_file def test_parse_23andme_skips_comments_and_blanks(fixtures_dir: Path) -> None: @@ -32,3 +32,32 @@ def test_parse_23andme_preserves_no_call_genotype(fixtures_dir: Path) -> None: def test_parse_23andme_accepts_string_path(fixtures_dir: Path) -> None: result = parse_23andme(str(fixtures_dir / "sample_23andme.txt")) assert "rs4680" in result + + +def test_parse_source_file_extracts_vendor_build_and_qc(fixtures_dir: Path) -> None: + parsed = parse_source_file(fixtures_dir / "sample_23andme.txt") + assert parsed.source.vendor == "23andMe" + assert parsed.source.build == "GRCh37" + assert parsed.source.unique_rsid_count == 11 + assert parsed.source.no_call_count == 1 + assert any(issue.code == "small-file" for issue in parsed.source.issues) + assert any("consumer array" in note.lower() for note in parsed.source.blind_spots) + + +def test_parse_source_file_tracks_duplicates_and_ambiguous_calls(tmp_path: Path) -> None: + path = tmp_path / "dna.txt" + path.write_text( + "# generated by MyHeritage\n" + "# build 38\n" + "rs1\t1\t100\tAA\n" + "rs1\t1\t100\tAG\n" + "rs2\t1\t101\tDD\n" + "bad-row\n" + ) + parsed = parse_source_file(path) + assert parsed.source.vendor == "MyHeritage" + assert parsed.source.build == "GRCh38" + assert parsed.genotypes["rs1"] == "AG" + assert parsed.source.duplicate_rsid_count == 1 + assert parsed.source.ambiguous_call_count == 1 + assert parsed.source.malformed_row_count == 1 diff --git a/tests/test_report.py b/tests/test_report.py index e05ec86..7cf4358 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -1,7 +1,13 @@ import json +from pathlib import Path +from opendna.analyzer import analyze +from opendna.annotations import annotate, load_clinvar, load_pharmgkb from opendna.models import Finding +from opendna.panels import load_panels +from opendna.parser import parse_source_file from opendna.report import render_report +from opendna.summaries import build_analysis_summary def _sample_findings() -> list[Finding]: @@ -80,3 +86,15 @@ def test_render_report_escapes_html_in_prose() -> None: bundle = render_report(_sample_findings(), llm_prose="") assert "