From cbf75387a984b81c8c5830e7f5845a742b975348 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Wed, 1 Oct 2025 01:52:14 +0400 Subject: [PATCH 001/102] First new outlines-partial --- extraction/config/pipeline.json | 43 + extraction/hyb_db.py | 875 +++++++++++++++++++++ extraction/passes/A_core/prompt.txt | 15 + extraction/passes/A_core/schema.json | 22 + extraction/passes/B_index/prompt.txt | 17 + extraction/passes/B_index/schema.json | 34 + extraction/passes/C_sequences/prompt.txt | 16 + extraction/passes/C_sequences/schema.json | 117 +++ extraction/passes/D_parameters/prompt.txt | 16 + extraction/passes/D_parameters/schema.json | 109 +++ extraction/passes/E_outcomes/prompt.txt | 14 + extraction/passes/E_outcomes/schema.json | 46 ++ extraction/passes/F_pairings/prompt.txt | 14 + extraction/passes/F_pairings/schema.json | 33 + extraction/passes/common.txt | 13 + extraction/schemas/full.json | 514 ++++++++++++ 16 files changed, 1898 insertions(+) create mode 100644 extraction/config/pipeline.json create mode 100644 extraction/hyb_db.py create mode 100644 extraction/passes/A_core/prompt.txt create mode 100644 extraction/passes/A_core/schema.json create mode 100644 extraction/passes/B_index/prompt.txt create mode 100644 extraction/passes/B_index/schema.json create mode 100644 extraction/passes/C_sequences/prompt.txt create mode 100644 extraction/passes/C_sequences/schema.json create mode 100644 extraction/passes/D_parameters/prompt.txt create mode 100644 extraction/passes/D_parameters/schema.json create mode 100644 extraction/passes/E_outcomes/prompt.txt create mode 100644 extraction/passes/E_outcomes/schema.json create mode 100644 extraction/passes/F_pairings/prompt.txt create mode 100644 extraction/passes/F_pairings/schema.json create mode 100644 extraction/passes/common.txt create mode 100644 extraction/schemas/full.json diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json new file mode 100644 index 0000000..3f4f7ae --- /dev/null 
+++ b/extraction/config/pipeline.json @@ -0,0 +1,43 @@ +{ + "model_name": "myaniu/qwen2.5-1m:7b", + "num_ctx": 131072, + "num_predict": 65536, + "timeout_s": 1800, + "input_dir": "outputs/text", + "out_dir": "outlines_output", + "full_schema_path": "schema/json/article.json", + "db_path": "outlines_output/massive.sqlite", + "article_glob": "input/txt*.txt", + "passes": [ + { + "name": "A_core", + "schema": "passes/A_core/schema.json", + "prompt": "passes/A_core/prompt.txt" + }, + { + "name": "B_index", + "schema": "passes/B_index/schema.json", + "prompt": "passes/B_index/prompt.txt" + }, + { + "name": "C_sequences", + "schema": "passes/C_sequences/schema.json", + "prompt": "passes/C_sequences/prompt.txt" + }, + { + "name": "D_parameters", + "schema": "passes/D_parameters/schema.json", + "prompt": "passes/D_parameters/prompt.txt" + }, + { + "name": "E_outcomes", + "schema": "passes/E_outcomes/schema.json", + "prompt": "passes/E_outcomes/prompt.txt" + }, + { + "name": "F_pairings", + "schema": "passes/F_pairings/schema.json", + "prompt": "passes/F_pairings/prompt.txt" + } + ] +} \ No newline at end of file diff --git a/extraction/hyb_db.py b/extraction/hyb_db.py new file mode 100644 index 0000000..de3b0c9 --- /dev/null +++ b/extraction/hyb_db.py @@ -0,0 +1,875 @@ +# -*- coding: utf-8 -*- +import outlines +from outlines.types import JsonSchema +import ollama +import re +from typing import Optional, Tuple, Dict, Any, List +import json +from pathlib import Path +from tqdm import tqdm +#from __future__ import annotations +import sqlite3 +from contextlib import contextmanager +from datetime import datetime, timezone +from loguru import logger +from ollama import chat, ChatResponse +from json_repair import repair_json +import os, sys +from jsonschema import Draft202012Validator + +# -*- coding: utf-8 -*- +""" +SQLite dataset builder for hybridization-article extractions. 
+ +Public API: + init_db(db_path) + insert_article_object(db_path, article_obj, model_name, article_name) + +Features: +- Auto-initializes schema (tables, indexes, views). +- Preserves every run (no overwrites). +- Normalizes sense/antisense & prime markers. +- Guards against non-oligo "probes" (skips probe insertion but keeps experiment). +- Includes Ollama-style helper tools with Google docstrings. +""" +import json +import re +import sqlite3 +from contextlib import contextmanager +from datetime import datetime, timezone +from typing import Any, Dict, Optional, Tuple, List + + +@contextmanager +def _db(db_path: str): + """Context manager for SQLite connection with FK + WAL enabled.""" + conn = sqlite3.connect(db_path) + try: + conn.execute("PRAGMA foreign_keys = ON;") + conn.execute("PRAGMA journal_mode = WAL;") + yield conn + conn.commit() + finally: + conn.close() + + +# ----------------------------- Schema DDL ----------------------------- # + +_TABLES_AND_INDEXES_SQL = """ +CREATE TABLE IF NOT EXISTS articles ( + id INTEGER PRIMARY KEY, + doi TEXT NOT NULL UNIQUE, + latest_article_name TEXT, + latest_abstract TEXT, + latest_topic TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')) +); + +CREATE TABLE IF NOT EXISTS runs ( + id INTEGER PRIMARY KEY, + article_id INTEGER NOT NULL, + model_name TEXT NOT NULL, + article_name TEXT, + branch TEXT NOT NULL CHECK (branch IN ('experiments','no_sequences')), + created_at TEXT NOT NULL, + FOREIGN KEY (article_id) REFERENCES articles(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_runs_article ON runs(article_id); +CREATE INDEX IF NOT EXISTS idx_runs_created ON runs(created_at); +CREATE INDEX IF NOT EXISTS idx_runs_model ON runs(model_name); + +CREATE TABLE IF NOT EXISTS raw_payloads ( + run_id INTEGER PRIMARY KEY, + json TEXT NOT NULL, + FOREIGN KEY (run_id) REFERENCES runs(id) ON DELETE CASCADE +); + +CREATE TABLE IF NOT EXISTS experiments ( + id INTEGER PRIMARY KEY, + run_id INTEGER NOT NULL, + id_exp 
TEXT NOT NULL, + type TEXT, + description TEXT NOT NULL, + raw_description TEXT, + organism TEXT, + technology TEXT, + annealing_qualitative INTEGER, -- NULL/0/1 + rna_impurities_qualitative INTEGER, -- NULL/0/1 + UNIQUE (run_id, id_exp), + FOREIGN KEY (run_id) REFERENCES runs(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_experiments_run ON experiments(run_id); +CREATE INDEX IF NOT EXISTS idx_experiments_idexp ON experiments(id_exp); + +CREATE TABLE IF NOT EXISTS oligos ( + id INTEGER PRIMARY KEY, + raw TEXT NOT NULL, + sequence TEXT, + length_bases INTEGER, + prime_prefix INTEGER CHECK (prime_prefix IN (3,5)), + five_prime_label TEXT, + three_prime_label TEXT, + sense_antisense TEXT CHECK (sense_antisense IN ('sense','antisense')), + provenance_source_type TEXT, + provenance_page INTEGER, + provenance_section TEXT, + provenance_quote TEXT, + provenance_notes TEXT +); +CREATE INDEX IF NOT EXISTS idx_oligos_seq ON oligos(sequence); + +CREATE TABLE IF NOT EXISTS probes ( + id INTEGER PRIMARY KEY, + experiment_id INTEGER NOT NULL, + name TEXT NOT NULL, + amplicon_id TEXT, + oligo_id INTEGER NOT NULL, + fluorophore TEXT, + quencher TEXT, + sense_antisense TEXT CHECK (sense_antisense IN ('sense','antisense')), + notes TEXT, + FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE, + FOREIGN KEY (oligo_id) REFERENCES oligos(id) +); +CREATE INDEX IF NOT EXISTS idx_probes_name ON probes(name); +CREATE INDEX IF NOT EXISTS idx_probes_exp ON probes(experiment_id); + +CREATE TABLE IF NOT EXISTS target_sequences ( + id INTEGER PRIMARY KEY, + experiment_id INTEGER NOT NULL, + oligo_id INTEGER NOT NULL, + FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE, + FOREIGN KEY (oligo_id) REFERENCES oligos(id) +); + +CREATE TABLE IF NOT EXISTS primer_pairs ( + id INTEGER PRIMARY KEY, + experiment_id INTEGER NOT NULL, + forward_oligo_id INTEGER NOT NULL, + reverse_oligo_id INTEGER NOT NULL, + FOREIGN KEY (experiment_id) REFERENCES 
experiments(id) ON DELETE CASCADE, + FOREIGN KEY (forward_oligo_id) REFERENCES oligos(id), + FOREIGN KEY (reverse_oligo_id) REFERENCES oligos(id) +); +CREATE INDEX IF NOT EXISTS idx_primers_exp ON primer_pairs(experiment_id); + +CREATE TABLE IF NOT EXISTS related_sequences ( + id INTEGER PRIMARY KEY, + experiment_id INTEGER NOT NULL, + oligo_id INTEGER NOT NULL, + description TEXT, + FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE, + FOREIGN KEY (oligo_id) REFERENCES oligos(id) +); +CREATE INDEX IF NOT EXISTS idx_relseqs_exp ON related_sequences(experiment_id); + +CREATE TABLE IF NOT EXISTS outcomes ( + id INTEGER PRIMARY KEY, + experiment_id INTEGER NOT NULL, + outcome INTEGER, -- NULL/0/1 + comparative_notes TEXT, + FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_outcomes_exp ON outcomes(experiment_id); + +CREATE TABLE IF NOT EXISTS measurements ( + id INTEGER PRIMARY KEY, + experiment_id INTEGER NOT NULL, + key TEXT NOT NULL, + raw TEXT NOT NULL, + value REAL, + unit TEXT, + si_value REAL, + si_unit TEXT, + assumptions TEXT, + provenance_source_type TEXT, + provenance_page INTEGER, + provenance_section TEXT, + provenance_quote TEXT, + provenance_notes TEXT, + FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_measurements_exp_key ON measurements(experiment_id, key); + +CREATE TABLE IF NOT EXISTS pairings ( + id INTEGER PRIMARY KEY, + experiment_id INTEGER NOT NULL, + paired_with_probe_name TEXT, + relationship TEXT, + FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_pairings_exp ON pairings(experiment_id); + +CREATE TABLE IF NOT EXISTS extraction_report_entries ( + id INTEGER PRIMARY KEY, + run_id INTEGER NOT NULL, + experiment_id INTEGER, + kind TEXT NOT NULL CHECK (kind IN ('missing','uncertain')), + json_pointer TEXT NOT NULL, + notes TEXT, + FOREIGN KEY 
(run_id) REFERENCES runs(id) ON DELETE CASCADE, + FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_report_run_kind ON extraction_report_entries(run_id, kind); + +CREATE TABLE IF NOT EXISTS no_sequences_explanations ( + id INTEGER PRIMARY KEY, + run_id INTEGER NOT NULL, + explanation TEXT NOT NULL, + FOREIGN KEY (run_id) REFERENCES runs(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_no_seq_run ON no_sequences_explanations(run_id); +""" + +_VIEWS_SQL = """ +CREATE VIEW IF NOT EXISTS view_experiments_flat AS +SELECT + a.doi AS doi, + r.model_name AS model_name, + r.article_name AS article_name, + r.created_at AS run_created_at, + e.id AS experiment_id, + e.id_exp AS id_exp, + e.type AS exp_type, + e.description AS exp_description, + e.organism AS organism, + e.technology AS technology, + p.name AS probe_name, + p.amplicon_id AS amplicon_id, + p.fluorophore AS probe_fluorophore, + p.quencher AS probe_quencher, + po.sequence AS probe_sequence, + po.five_prime_label AS probe_5p_label, + po.three_prime_label AS probe_3p_label, + tgo.sequence AS target_sequence, + o.outcome AS outcome_bool, + o.comparative_notes AS outcome_notes +FROM experiments e +JOIN runs r ON r.id = e.run_id +JOIN articles a ON a.id = r.article_id +LEFT JOIN probes p ON p.experiment_id = e.id +LEFT JOIN oligos po ON po.id = p.oligo_id +LEFT JOIN target_sequences ts ON ts.experiment_id = e.id +LEFT JOIN oligos tgo ON tgo.id = ts.oligo_id +LEFT JOIN outcomes o ON o.experiment_id = e.id; + +CREATE VIEW IF NOT EXISTS view_measurements_flat AS +SELECT + a.doi, + r.model_name, + r.article_name, + r.created_at AS run_created_at, + e.id AS experiment_id, + e.id_exp, + m.key, + m.raw, + m.value, + m.unit, + m.si_value, + m.si_unit, + m.assumptions +FROM measurements m +JOIN experiments e ON e.id = m.experiment_id +JOIN runs r ON r.id = e.run_id +JOIN articles a ON a.id = r.article_id; +""" + + +def _ensure_schema(conn: sqlite3.Connection) 
def _ensure_schema(conn: sqlite3.Connection) -> None:
    """Create all tables, indexes, and views if they don't exist."""
    cur = conn.cursor()
    cur.executescript(_TABLES_AND_INDEXES_SQL)
    cur.executescript(_VIEWS_SQL)
    conn.commit()


# ----------------------------- Public API ----------------------------- #

def init_db(db_path: str) -> None:
    """Create (if not exists) the SQLite database schema, indices, and views.

    Args:
        db_path: Path to the SQLite file. Created if it doesn't exist.
    """
    with _db(db_path) as conn:
        _ensure_schema(conn)


def insert_article_object(db_path: str, article_obj: Dict[str, Any],
                          model_name: str, article_name: Optional[str]) -> int:
    """Insert a schema-conformant JSON object into the SQLite DB.

    Auto-creates the DB schema if missing. Preserves every run (no overwrites).

    Args:
        db_path: SQLite file path.
        article_obj: Dict that conforms to the Hybridization Article schema.
        model_name: Model identifier (e.g., 'Qwen2.5-Instruct-1M:14b').
        article_name: Name/key for the source file processed.

    Returns:
        run_id (int) for this insertion.

    Raises:
        ValueError: If article_obj has no top-level 'doi'.
    """
    with _db(db_path) as conn:
        _ensure_schema(conn)
        cur = conn.cursor()

        doi = article_obj.get("doi")
        if not doi:
            raise ValueError("Input must contain a top-level 'doi' string.")

        # NOTE(review): any list — even an empty one — selects the
        # 'experiments' branch; only a missing/non-list value is 'no_sequences'.
        has_experiments = isinstance(article_obj.get("experiments"), list)
        branch = "experiments" if has_experiments else "no_sequences"

        article_id = _get_or_create_article(
            cur,
            doi=doi,
            article_name=article_name or article_obj.get("article_name"),
            abstract=article_obj.get("abstract"),
            topic=article_obj.get("topic"),
        )

        run_id = _create_run(cur, article_id, model_name, article_name, branch,
                             raw_json=article_obj)

        # Top-level extraction report (if any)
        _insert_extraction_report(cur, run_id, article_obj.get("extraction_report"),
                                  experiment_id=None)

        if branch == "no_sequences":
            explanation = article_obj.get(
                "explanation_why_does_not_this_article_have_any_hybridization_probes_sequences"
            ) or ""
            cur.execute(
                "INSERT INTO no_sequences_explanations (run_id, explanation) VALUES (?, ?)",
                (run_id, explanation),
            )
            return run_id

        # ---- experiments branch ----
        for exp in (article_obj.get("experiments") or []):
            experiment_id = _insert_experiment_row(cur, run_id, exp)
            _insert_extraction_report(cur, run_id, exp.get("extraction_report"),
                                      experiment_id=experiment_id)
            _insert_experiment_sequences(cur, run_id, experiment_id,
                                         exp.get("sequences") or {})
            _insert_experiment_measurements(cur, experiment_id, exp)
            _insert_experiment_outcome(cur, experiment_id, exp)

        return run_id


def _insert_experiment_row(cur: sqlite3.Cursor, run_id: int,
                           exp: Dict[str, Any]) -> int:
    """Insert one experiments row (core fields + flattened metadata booleans)."""
    meta = exp.get("metadata") or {}
    cur.execute(
        """
        INSERT INTO experiments
            (run_id, id_exp, type, description, raw_description,
             organism, technology, annealing_qualitative, rna_impurities_qualitative)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        (
            run_id,
            exp.get("id_exp"),
            exp.get("type"),
            exp.get("description") or "",
            exp.get("raw_description"),
            meta.get("organism"),
            meta.get("technology"),
            _to_int_bool((meta.get("annealing") or {}).get("qualitative")),
            _to_int_bool((meta.get("rna_impurities") or {}).get("qualitative")),
        ),
    )
    return cur.lastrowid


def _insert_experiment_sequences(cur: sqlite3.Cursor, run_id: int,
                                 experiment_id: int,
                                 seqs: Dict[str, Any]) -> None:
    """Insert target/probe/primer/related oligo rows for one experiment.

    BUG FIX: a rejected (non-oligo) probe previously skipped target, primer,
    and related-sequence insertion as well, contradicting the module contract
    ("skips probe insertion but keeps experiment"). Now only the probe row is
    skipped; all other sequence rows are inserted regardless.
    """
    # Target (optional) — inserted regardless of probe validity.
    tgt = seqs.get("target_sequence")
    if isinstance(tgt, dict) and (tgt.get("raw") is not None):
        tgt_oligo_id = _insert_oligo(cur, tgt)
        cur.execute(
            "INSERT INTO target_sequences (experiment_id, oligo_id) VALUES (?, ?)",
            (experiment_id, tgt_oligo_id),
        )

    # Probe — gated by the non-oligo heuristic; rejection is recorded.
    probe = seqs.get("probe") or {}
    if not _has_real_probe(probe):
        _insert_extraction_report(
            cur, run_id,
            {"missing": ["/experiments/*/sequences/probe/oligo/sequence"],
             "notes": "Rejected probable non-oligo probe (no bases/labels/length)."},
            experiment_id=experiment_id,
        )
    else:
        probe_oligo_id = _insert_oligo(cur, probe.get("oligo") or {})
        sa = _coerce_sa(probe.get("sense_antisense"), probe.get("name"))
        cur.execute(
            """
            INSERT INTO probes
                (experiment_id, name, amplicon_id, oligo_id, fluorophore, quencher, sense_antisense, notes)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                experiment_id,
                probe.get("name"),
                probe.get("amplicon_id"),
                probe_oligo_id,
                probe.get("fluorophore"),
                probe.get("quencher"),
                sa,
                probe.get("notes"),
            ),
        )

    # Primers (optional)
    primers = seqs.get("primer_sequences")
    if isinstance(primers, dict):
        fwd_id = _insert_oligo(cur, primers.get("forward") or {})
        rev_id = _insert_oligo(cur, primers.get("reverse") or {})
        cur.execute(
            "INSERT INTO primer_pairs (experiment_id, forward_oligo_id, reverse_oligo_id) VALUES (?, ?, ?)",
            (experiment_id, fwd_id, rev_id),
        )

    # Related sequences (0..N)
    for rs in (seqs.get("related_sequences") or []):
        r_oligo = rs.get("related_sequence")
        if isinstance(r_oligo, dict) and (r_oligo.get("raw") is not None):
            r_oligo_id = _insert_oligo(cur, r_oligo)
            cur.execute(
                "INSERT INTO related_sequences (experiment_id, oligo_id, description) VALUES (?, ?, ?)",
                (experiment_id, r_oligo_id, rs.get("description")),
            )


def _insert_experiment_measurements(cur: sqlite3.Cursor, experiment_id: int,
                                    exp: Dict[str, Any]) -> None:
    """Insert measurement rows from experiment_properties and metadata."""
    exprops = exp.get("experiment_properties") or {}
    concs = exprops.get("concentrations") or {}
    _insert_measurement(cur, experiment_id,
                        "experiment_properties.concentrations.dna_rna_concentration",
                        concs.get("dna_rna_concentration"))
    _insert_measurement(cur, experiment_id,
                        "experiment_properties.concentrations.concentration_SI",
                        concs.get("concentration_SI"))

    params = exprops.get("parameters_SI") or {}
    for key in ("temperature", "Tris", "Na", "K", "Mg", "DMSO"):
        _insert_measurement(cur, experiment_id,
                            f"experiment_properties.parameters_SI.{key}",
                            params.get(key))

    meta = exp.get("metadata") or {}
    _insert_measurement(cur, experiment_id, "metadata.pH", meta.get("pH"))
    _insert_measurement(cur, experiment_id, "metadata.annealing.quantitative",
                        (meta.get("annealing") or {}).get("quantitative"))
    _insert_measurement(cur, experiment_id, "metadata.rna_impurities.quantitative",
                        (meta.get("rna_impurities") or {}).get("quantitative"))


def _insert_experiment_outcome(cur: sqlite3.Cursor, experiment_id: int,
                               exp: Dict[str, Any]) -> None:
    """Insert the outcome row, its fluorescence measurement, and optional pairing."""
    out = exp.get("outcome") or {}
    cur.execute(
        "INSERT INTO outcomes (experiment_id, outcome, comparative_notes) VALUES (?, ?, ?)",
        (experiment_id, _to_int_bool(out.get("outcome")), out.get("comparative_notes")),
    )
    _insert_measurement(cur, experiment_id, "outcome.fluorescence",
                        out.get("fluorescence"))

    pair = exp.get("pairing") or {}
    if pair.get("paired_with_probe_name") or pair.get("relationship"):
        cur.execute(
            "INSERT INTO pairings (experiment_id, paired_with_probe_name, relationship) VALUES (?, ?, ?)",
            (experiment_id, pair.get("paired_with_probe_name"), pair.get("relationship")),
        )
def to_si(value: Optional[float], unit: Optional[str]) -> Tuple[Optional[float], Optional[str]]:
    """Convert a numeric value and unit to SI.

    Supports common units from hybridization papers:
    - Temperature: °C -> K (K = °C + 273.15), K stays K.
    - Concentration: M, mM, µM/um, nM -> mol/m^3 (1 mM = 1 mol/m^3).
    - Percent: % -> dimensionless fraction (value/100).

    Args:
        value: The numeric value parsed from the article, or None if unknown.
        unit: The unit string as written in the article (e.g., '°C', 'mM', '%'), or None.

    Returns:
        A pair (si_value, si_unit), both None when not convertible.

    Examples:
        >>> to_si(25, '°C')
        (298.15, 'K')
        >>> to_si(2, 'mM')
        (2.0, 'mol/m^3')
        >>> to_si(10, '%')
        (0.1, 'dimensionless')
    """
    if value is None or unit is None:
        return None, None

    normalized = unit.strip().lower().replace('µ', 'u')
    # Ordered lookup table: recognized spellings -> conversion function.
    conversions = (
        ({'c', '°c', 'deg c', 'celsius'}, lambda v: (v + 273.15, 'K')),
        ({'k', 'kelvin'}, lambda v: (v, 'K')),
        ({'m', 'mol/l'}, lambda v: (v * 1000.0, 'mol/m^3')),
        ({'mm', 'mmol/l', 'mmol'}, lambda v: (v * 1.0, 'mol/m^3')),
        ({'um', 'umol/l', 'micromolar'}, lambda v: (v * 1e-3, 'mol/m^3')),
        ({'nm', 'nmol/l', 'nanomolar'}, lambda v: (v * 1e-6, 'mol/m^3')),
        ({'%', 'percent', 'perc'}, lambda v: (v / 100.0, 'dimensionless')),
    )
    for spellings, convert in conversions:
        if normalized in spellings:
            return convert(value)
    return None, None


OLIGO_RE = re.compile(r"""
^\s*
(?:(?P<prime>(?:5|3)(?:['′’]|0|O)?)\s*-\s*)?
(?:(?P<prefix>(?:[A-Za-z0-9+]+-)+))?
(?P<seq>[ACGUTRYSWKMBDHVN]+)
(?:(?P<suffix>(?:-[A-Za-z0-9+]+)+))?
(?:\s*\(\s*(?P<len>\d+)\s*(?:b|bp)\s*\)\s*)?
\s*$
""", re.X)

def parse_oligo(raw: Optional[str]) -> Dict[str, Any]:
    """Parse a decorated oligo string into schema-ready parts.

    Accepts OCR-prone patterns like "50-FAM-...-BHQ2 (27 b)" and normalizes:
    - prime_prefix: 5 or 3 when 5′/3′ (includes 50/5O variants)
    - sequence: IUPAC bases (uppercase)
    - length_bases: integer if present
    - labels: all labels in order; five/three_prime_label are first/last

    Args:
        raw: The exact oligo string from the article (may include labels and
            length), or None.

    Returns:
        A dict matching the 'decoratedOligo' shape (minus provenance).
    """
    parsed: Dict[str, Any] = {
        "raw": raw,
        "sequence": None,
        "length_bases": None,
        "prime_prefix": None,
        "five_prime_label": None,
        "three_prime_label": None,
        "labels": [],
        "sense_antisense": None,
    }
    if not raw:
        return parsed

    match = OLIGO_RE.match(raw)
    if match is None:
        return parsed

    prime = match.group('prime')
    if prime:
        parsed["prime_prefix"] = 5 if prime[0] == '5' else 3

    bases = match.group('seq')
    if bases:
        parsed["sequence"] = bases.upper()

    length = match.group('len')
    if length:
        parsed["length_bases"] = int(length)

    # Collect decorations from both ends, in order of appearance.
    labels: List[str] = [
        part
        for group in (match.group('prefix'), match.group('suffix'))
        if group
        for part in group.split('-')
        if part
    ]
    parsed["labels"] = labels
    if labels:
        parsed["five_prime_label"] = labels[0]
        parsed["three_prime_label"] = labels[-1]

    return parsed


def make_measurement(raw: Optional[str],
                     value: Optional[float] = None,
                     unit: Optional[str] = None) -> Dict[str, Any]:
    """Build a 'measurement' object with SI conversion.

    Convenience helper to populate the schema's measurement type while
    keeping the raw text.

    Args:
        raw: The raw textual measurement from the article (e.g., '58 °C').
        value: Parsed numeric value, if available.
        unit: Parsed unit string as written (e.g., '°C', 'mM', '%').

    Returns:
        A dict with keys: raw, value, unit, si_value, si_unit, assumptions.
        Unknown or unsupported units yield si_value/si_unit = None.
    """
    if value is not None and unit is not None:
        si_value, si_unit = to_si(value, unit)
    else:
        si_value, si_unit = None, None
    return {
        "raw": raw or "",
        "value": value,
        "unit": unit,
        "si_value": si_value,
        "si_unit": si_unit,
        "assumptions": None,
    }
+ """ + si_value, si_unit = to_si(value, unit) if (value is not None and unit is not None) else (None, None) + return { + "raw": raw or "", + "value": value, + "unit": unit, + "si_value": si_value, + "si_unit": si_unit, + "assumptions": None + } + + +# ----------------------------- Normalization / validation helpers ----------------------------- # + +_SA_MAP = { + 's': 'sense', + 'sense': 'sense', + 'as': 'antisense', + 'antisense': 'antisense', + '+': 'sense', + '-': 'antisense', + 'forward': 'sense', + 'reverse': 'antisense', +} +_SA_NAME_RE = re.compile(r"\)\s*(as|s)\s*$", re.IGNORECASE) + +def _detect_sa_from_name(probe_name: Optional[str]) -> Optional[str]: + """Infer sense/antisense from a trailing '(...)s' or '(...)as' in the probe name. + + Args: + probe_name: Probe name (e.g., 'N3-FAM(27)s'). + + Returns: + 'sense', 'antisense', or None if not inferable. + """ + if not probe_name: + return None + m = _SA_NAME_RE.search(probe_name.strip()) + if not m: + return None + g = m.group(1).lower() + return 'antisense' if g == 'as' else 'sense' + + +def _coerce_sa(value: Optional[str], probe_name: Optional[str] = None) -> Optional[str]: + """Coerce various encodings to 'sense'/'antisense'/None. + + Args: + value: A string like 's', 'as', 'sense', 'antisense', '+', '-', or None. + probe_name: Fallback context to infer sense/antisense from the name suffix. + + Returns: + 'sense', 'antisense', or None. + """ + if value is None or (isinstance(value, str) and not value.strip()): + return _detect_sa_from_name(probe_name) + v = str(value).strip().lower() + if v in _SA_MAP: + return _SA_MAP[v] + return _detect_sa_from_name(probe_name) + + +def _coerce_prime_prefix(value: Any) -> Optional[int]: + """Clamp prime prefix to {3, 5} or None. + + Handles OCR-like strings such as '5', "5'", '50', '5O', '5′'. + + Args: + value: Raw input for prime prefix. + + Returns: + 3, 5, or None. 
+ """ + if value is None: + return None + s = str(value).strip() + if s.startswith('5'): + return 5 + if s.startswith('3'): + return 3 + return None + + +def _has_real_probe(probe: Dict[str, Any]) -> bool: + """Heuristic gate: reject obviously non-oligo 'probes'. + + Accepts a probe only if at least one of these holds: + - >= 6 IUPAC bases appear in oligo.sequence or oligo.raw + - a known label is present (FAM/ROX/Cy5/BHQ1/BHQ2/RTQ1) in labels/five/three + - length_bases is present + + Args: + probe: The 'probe' dict from the schema. + + Returns: + True if looks like a real oligo; False otherwise. + """ + if not isinstance(probe, dict): + return False + oligo = probe.get("oligo") or {} + raw = (oligo.get("raw") or "") + seq = (oligo.get("sequence") or "") + has_bases = bool(re.search(r"[ACGUTRYSWKMBDHVN]{6,}", (seq or raw).upper())) + has_label = any(bool(oligo.get(k)) for k in ("five_prime_label", "three_prime_label")) \ + or bool(oligo.get("labels")) + has_length = bool(oligo.get("length_bases")) + return has_bases or has_label or has_length + + +# ----------------------------- DB helpers ----------------------------- # + +def _utcnow_iso() -> str: + """UTC timestamp in ISO8601 format.""" + return datetime.now(timezone.utc).isoformat() + + +def _get_or_create_article(cur: sqlite3.Cursor, doi: str, + article_name: Optional[str], + abstract: Optional[str], + topic: Optional[str]) -> int: + """Fetch article.id by DOI, creating the row if needed (and refreshing metadata).""" + cur.execute("SELECT id FROM articles WHERE doi = ?", (doi,)) + row = cur.fetchone() + if row: + article_id = row[0] + cur.execute( + """ + UPDATE articles + SET latest_article_name = COALESCE(?, latest_article_name), + latest_abstract = COALESCE(?, latest_abstract), + latest_topic = COALESCE(?, latest_topic) + WHERE id = ? 
+ """, + (article_name, abstract, topic, article_id), + ) + return article_id + cur.execute( + """ + INSERT INTO articles (doi, latest_article_name, latest_abstract, latest_topic, created_at) + VALUES (?, ?, ?, ?, ?) + """, + (doi, article_name, abstract, topic, _utcnow_iso()), + ) + return cur.lastrowid + + +def _create_run(cur: sqlite3.Cursor, article_id: int, model_name: str, + article_name: Optional[str], branch: str, + raw_json: Dict[str, Any]) -> int: + """Create a run row and persist the raw JSON payload.""" + cur.execute( + """ + INSERT INTO runs (article_id, model_name, article_name, branch, created_at) + VALUES (?, ?, ?, ?, ?) + """, + (article_id, model_name, article_name, branch, _utcnow_iso()), + ) + run_id = cur.lastrowid + cur.execute("INSERT INTO raw_payloads (run_id, json) VALUES (?, ?)", + (run_id, json.dumps(raw_json, ensure_ascii=False))) + return run_id + + +def _insert_provenance_cols(entity: Dict[str, Any]) -> Tuple[Optional[str], Optional[int], Optional[str], Optional[str], Optional[str]]: + """Extract provenance fields with safe defaults.""" + prov = entity.get("provenance") or {} + return ( + prov.get("source_type"), + prov.get("page"), + prov.get("section"), + prov.get("quote"), + prov.get("notes"), + ) + + +def _insert_oligo(cur: sqlite3.Cursor, oligo: Dict[str, Any]) -> int: + """Insert an oligo row after normalizing prime_prefix and sense/antisense.""" + cleaned = dict(oligo or {}) + cleaned["prime_prefix"] = _coerce_prime_prefix(cleaned.get("prime_prefix")) + cleaned["sense_antisense"] = _coerce_sa(cleaned.get("sense_antisense")) + + ps, pg, sc, qu, no = _insert_provenance_cols(cleaned) + cur.execute( + """ + INSERT INTO oligos + (raw, sequence, length_bases, prime_prefix, + five_prime_label, three_prime_label, sense_antisense, + provenance_source_type, provenance_page, provenance_section, + provenance_quote, provenance_notes) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """, + ( + cleaned.get("raw", ""), + cleaned.get("sequence"), + cleaned.get("length_bases"), + cleaned.get("prime_prefix"), + cleaned.get("five_prime_label"), + cleaned.get("three_prime_label"), + cleaned.get("sense_antisense"), + ps, pg, sc, qu, no, + ), + ) + return cur.lastrowid + + +def _insert_measurement(cur: sqlite3.Cursor, experiment_id: int, key: str, m: Optional[Dict[str, Any]]) -> None: + """Insert a measurement if present.""" + if not m: + return + ps, pg, sc, qu, no = _insert_provenance_cols(m) + cur.execute( + """ + INSERT INTO measurements + (experiment_id, key, raw, value, unit, si_value, si_unit, assumptions, + provenance_source_type, provenance_page, provenance_section, provenance_quote, provenance_notes) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + experiment_id, + key, + (m.get("raw") or ""), + m.get("value"), + m.get("unit"), + m.get("si_value"), + m.get("si_unit"), + m.get("assumptions"), + ps, pg, sc, qu, no, + ), + ) + + +def _insert_extraction_report(cur: sqlite3.Cursor, run_id: int, + report: Optional[Dict[str, Any]], + experiment_id: Optional[int] = None) -> None: + """Insert extraction report entries (missing/uncertain pointers).""" + if not report: + return + for kind in ("missing", "uncertain"): + for ptr in report.get(kind, []) or []: + cur.execute( + """ + INSERT INTO extraction_report_entries (run_id, experiment_id, kind, json_pointer, notes) + VALUES (?, ?, ?, ?, ?) + """, + (run_id, experiment_id, kind, str(ptr), report.get("notes")), + ) + + +def _to_int_bool(val: Optional[bool]) -> Optional[int]: + """Convert Python bool/None -> 1/0/NULL for SQLite.""" + if val is None: + return None + return 1 if bool(val) else 0 diff --git a/extraction/passes/A_core/prompt.txt b/extraction/passes/A_core/prompt.txt new file mode 100644 index 0000000..d13a083 --- /dev/null +++ b/extraction/passes/A_core/prompt.txt @@ -0,0 +1,15 @@ +You are an information-extraction model. 
Output only a single JSON object that conforms to the provided JSON Schema. + +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article’s wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. + +For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following tasks: +* Extract the article’s doi, abstract, and topic (short label). +* If any is missing, set it to `null` and list in `extraction_report.missing`. diff --git a/extraction/passes/A_core/schema.json b/extraction/passes/A_core/schema.json new file mode 100644 index 0000000..5ff4bb3 --- /dev/null +++ b/extraction/passes/A_core/schema.json @@ -0,0 +1,22 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "ArticleCore", + "type": "object", + "additionalProperties": false, + "required": ["doi", "abstract", "topic", "extraction_report"], + "properties": { + "doi": { "type": "string", "minLength": 4, "maxLength": 200 }, + "abstract": { "type": "string", "minLength": 10, "maxLength": 5000 }, + "topic": { "type": "string", "minLength": 2, "maxLength": 200 }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "notes": { "type": ["string", "null"] } + } + } + } +} diff --git a/extraction/passes/B_index/prompt.txt b/extraction/passes/B_index/prompt.txt new file mode 100644 index 
0000000..d87c72a --- /dev/null +++ b/extraction/passes/B_index/prompt.txt @@ -0,0 +1,17 @@ +You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. + +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article’s wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. + +For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following tasks: +* Identify each hybridization experiment or probe pairing described. +* Assign a stable id_exp (e.g., N3-FAM-27-s or a short unique tag you derive). +* Provide a brief description and, if present verbatim, a raw_description. +* If experiment types are stated (e.g., DMA, qPCR), fill type; else null. 
diff --git a/extraction/passes/B_index/schema.json b/extraction/passes/B_index/schema.json new file mode 100644 index 0000000..f53aa08 --- /dev/null +++ b/extraction/passes/B_index/schema.json @@ -0,0 +1,34 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "ExperimentIndex", + "type": "object", + "additionalProperties": false, + "required": ["experiments", "extraction_report"], + "properties": { + "experiments": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "description", "type", "raw_description"], + "properties": { + "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + "type": { "type": ["string", "null"], "minLength": 2, "maxLength": 200 }, + "raw_description": { "type": ["string", "null"], "minLength": 1, "maxLength": 2000 }, + "description": { "type": "string", "minLength": 8, "maxLength": 2000 } + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "notes": { "type": ["string", "null"] } + } + } + } +} diff --git a/extraction/passes/C_sequences/prompt.txt b/extraction/passes/C_sequences/prompt.txt new file mode 100644 index 0000000..0984499 --- /dev/null +++ b/extraction/passes/C_sequences/prompt.txt @@ -0,0 +1,16 @@ +You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. + +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. 
+* Use the article’s wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. + +For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following tasks: +* For each `id_exp`, extract probe (name and the full oligo string exactly as printed in the article text), and include optional `target_sequence`, `primer_sequences`, and `related_sequences` when present, otherwise set them to `null`. +* The `oligo_lite.raw` must contain nucleotides and no ellipses. +* Keep labels like FAM/ROX/BHQ2 in the text; if article does not mention them explicitly, leave derived fields `null`. diff --git a/extraction/passes/C_sequences/schema.json b/extraction/passes/C_sequences/schema.json new file mode 100644 index 0000000..4d3c36f --- /dev/null +++ b/extraction/passes/C_sequences/schema.json @@ -0,0 +1,117 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "SequencesPerExperiment", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "probe", "target_sequence", "primer_sequences", "related_sequences"], + "properties": { + "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + "target_sequence": { "oneOf": [ { "$ref": "#/$defs/oligo_lite" }, { "type": "null" } ] }, + "probe": { "$ref": "#/$defs/probe_lite" }, + "primer_sequences": { + "oneOf": [ + { + "type": "object", + "additionalProperties": false, + "required": ["forward", "reverse"], + "properties": { + "forward": { "$ref": "#/$defs/oligo_lite" }, + "reverse": { "$ref": "#/$defs/oligo_lite" } + } + }, + { "type": "null" } + ] + }, + "related_sequences": { + "type": "array", + "minItems": 0, + "items": { + "type": 
"object", + "additionalProperties": false, + "required": ["related_sequence"], + "properties": { + "related_sequence": { "$ref": "#/$defs/oligo_lite" }, + "description": { "type": ["string", "null"], "maxLength": 200 } + } + } + } + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { "type": ["string", "null"] } + } + } + }, + "$defs": { + "iupacBases": { + "type": "string", + "description": "DNA/RNA bases in uppercase IUPAC alphabet: A C G U/T R Y S W K M B D H V N. No separators and no ellipsis inside the sequence.", + "pattern": "^[ACGUTRYSWKMBDHVN]+$", + "minLength": 5, + "maxLength": 5000 + }, + "measurement_lite": { + "type": "object", + "additionalProperties": false, + "required": ["raw", "value", "unit"], + "properties": { + "raw": { "type": "string", "minLength": 1, "maxLength": 200 }, + "value": { "type": ["number", "null"] }, + "unit": { "type": ["string", "null"], "maxLength": 50 } + } + }, + "oligo_lite": { + "type": "object", + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "five_prime_label", "three_prime_label", "sense_antisense"], + "properties": { + "raw": { + "type": "string", + "minLength": 5, + "maxLength": 200, + "pattern": "^[A-Za-z0-9\\-\\(\\)\\[\\]'\"/\\+\\sµ′’]*[ACGTIU][A-Za-z0-9\\-\\(\\)\\[\\]'\"/\\+\\sµ′’]*$", + "description": "Keep exactly as printed; must include at least one nucleotide; no ellipses." 
+ }, + "sequence": { + "$ref": "#/$defs/iupacBases" + }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 30 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 30 }, + "sense_antisense": { + "type": ["string", "null"], + "enum": ["sense", "antisense", null] + } + } + }, + "probe_lite": { + "type": "object", + "additionalProperties": false, + "required": ["name", "oligo", "fluorophore", "quencher", "sense_antisense", "amplicon_id", "notes"], + "properties": { + "name": { "type": "string", "minLength": 2, "maxLength": 200 }, + "amplicon_id": { "type": ["string", "null"], "maxLength": 40 }, + "oligo": { "$ref": "#/$defs/oligo_lite" }, + "fluorophore": { "type": ["string", "null"], "maxLength": 40 }, + "quencher": { "type": ["string", "null"], "maxLength": 40 }, + "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "notes": { "type": ["string", "null"], "maxLength": 400 } + } + } + } +} diff --git a/extraction/passes/D_parameters/prompt.txt b/extraction/passes/D_parameters/prompt.txt new file mode 100644 index 0000000..fdacda1 --- /dev/null +++ b/extraction/passes/D_parameters/prompt.txt @@ -0,0 +1,16 @@ +You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. + +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article’s wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. + +For the perfect result compliant to all constraints and limitations I will tip $2000! 
+ +Perform the following tasks: +* For each `id_exp`, extract `metadata` and `experiment_properties`. +* Use `measurement_lite` for numeric items: keep raw text and parsed value+unit when clear; otherwise leave numeric fields `null`. +* If not present in the article, use `null` and record the pointer. diff --git a/extraction/passes/D_parameters/schema.json b/extraction/passes/D_parameters/schema.json new file mode 100644 index 0000000..2856c9e --- /dev/null +++ b/extraction/passes/D_parameters/schema.json @@ -0,0 +1,109 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "ParametersPerExperiment", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "metadata", "experiment_properties"], + "properties": { + "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + "metadata": { + "type": "object", + "additionalProperties": false, + "required": ["organism", "technology", "annealing", "pH", "rna_impurities"], + "properties": { + "organism": { "type": ["string", "null"], "maxLength": 200 }, + "technology": { "type": ["string", "null"], "maxLength": 200 }, + "annealing": { + "oneOf": [ + { + "type": "object", + "additionalProperties": false, + "required": ["quantitative", "qualitative"], + "properties": { + "quantitative": { "$ref": "#/$defs/measurement_lite" }, + "qualitative": { "type": ["boolean", "null"] } + } + }, + { "type": "null" } + ] + }, + "pH": { "$ref": "#/$defs/measurement_lite" }, + "rna_impurities": { + "oneOf": [ + { + "type": "object", + "additionalProperties": false, + "required": ["quantitative", "qualitative"], + "properties": { + "quantitative": { "$ref": "#/$defs/measurement_lite" }, + "qualitative": { "type": ["boolean", "null"] } + } + }, + { "type": "null" } + ] + } + } + }, + "experiment_properties": { + 
"type": "object", + "additionalProperties": false, + "required": ["concentrations", "parameters_SI"], + "properties": { + "concentrations": { + "type": "object", + "additionalProperties": false, + "properties": { + "dna_rna_concentration": { "$ref": "#/$defs/measurement_lite" }, + "concentration_SI": { "$ref": "#/$defs/measurement_lite" } + } + }, + "parameters_SI": { + "type": "object", + "additionalProperties": false, + "required": ["temperature", "Tris", "Na", "K", "Mg", "DMSO"], + "properties": { + "temperature": { "$ref": "#/$defs/measurement_lite" }, + "Tris": { "$ref": "#/$defs/measurement_lite" }, + "Na": { "$ref": "#/$defs/measurement_lite" }, + "K": { "$ref": "#/$defs/measurement_lite" }, + "Mg": { "$ref": "#/$defs/measurement_lite" }, + "DMSO": { "$ref": "#/$defs/measurement_lite" } + } + } + } + } + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { "type": ["string", "null"] } + } + } + }, + "$defs": { + "measurement_lite": { + "type": "object", + "additionalProperties": false, + "required": ["raw", "value", "unit"], + "properties": { + "raw": { "type": "string", "minLength": 1, "maxLength": 200 }, + "value": { "type": ["number", "null"] }, + "unit": { "type": ["string", "null"], "maxLength": 50 } + } + } + } +} diff --git a/extraction/passes/E_outcomes/prompt.txt b/extraction/passes/E_outcomes/prompt.txt new file mode 100644 index 0000000..8de6aa1 --- /dev/null +++ b/extraction/passes/E_outcomes/prompt.txt @@ -0,0 +1,14 @@ +You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. + +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). 
+* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article’s wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. + +For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following tasks: +* For each `id_exp`, extract outcome (boolean if explicitly stated, otherwise `null`), `fluorescence` as `measurement_lite`, and any `comparative_notes`. diff --git a/extraction/passes/E_outcomes/schema.json b/extraction/passes/E_outcomes/schema.json new file mode 100644 index 0000000..c15da38 --- /dev/null +++ b/extraction/passes/E_outcomes/schema.json @@ -0,0 +1,46 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "OutcomesPerExperiment", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "outcome", "fluorescence", "comparative_notes"], + "properties": { + "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + "outcome": { "type": ["boolean", "null"] }, + "fluorescence": { "$ref": "#/$defs/measurement_lite" }, + "comparative_notes": { "type": ["string", "null"], "maxLength": 500 } + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { "type": ["string", "null"] } + } + } + }, + "$defs": { + "measurement_lite": { + "type": "object", + 
"additionalProperties": false, + "required": ["raw", "value", "unit"], + "properties": { + "raw": { "type": "string", "minLength": 1, "maxLength": 200 }, + "value": { "type": ["number", "null"] }, + "unit": { "type": ["string", "null"], "maxLength": 50 } + } + } + } +} diff --git a/extraction/passes/F_pairings/prompt.txt b/extraction/passes/F_pairings/prompt.txt new file mode 100644 index 0000000..deba9ff --- /dev/null +++ b/extraction/passes/F_pairings/prompt.txt @@ -0,0 +1,14 @@ +You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. + +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article’s wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. + +For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following tasks: +* For each `id_exp`, extract references to paired probes and relationship (e.g., "same sequence different labels", "reciprocal"). 
diff --git a/extraction/passes/F_pairings/schema.json b/extraction/passes/F_pairings/schema.json new file mode 100644 index 0000000..ce46748 --- /dev/null +++ b/extraction/passes/F_pairings/schema.json @@ -0,0 +1,33 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "PairingsPerExperiment", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "paired_with_probe_name", "relationship"], + "properties": { + "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + "paired_with_probe_name": { "type": ["string", "null"], "maxLength": 200 }, + "relationship": { "type": ["string", "null"], "maxLength": 200 } + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { "type": ["string", "null"] } + } + } + } +} diff --git a/extraction/passes/common.txt b/extraction/passes/common.txt new file mode 100644 index 0000000..398a711 --- /dev/null +++ b/extraction/passes/common.txt @@ -0,0 +1,13 @@ +You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. + +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article’s wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. 
It will be automated and must be correct and suitable for automated analysis. + +For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following tasks: diff --git a/extraction/schemas/full.json b/extraction/schemas/full.json new file mode 100644 index 0000000..a2c9b48 --- /dev/null +++ b/extraction/schemas/full.json @@ -0,0 +1,514 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://example.org/schemas/hybridization-article.schema.json", + "title": "Hybridization Article", + "description": "Per-article extraction of hybridization experiments as target-probe pairs (plus primers/related sequences). Includes decorated oligos (fluorophores/quenchers, 5'/3' marks, sense/antisense), and parameters stored as raw text and normalized SI.", + "type": "object", + "unevaluatedProperties": false, + + "$defs": { + "extractionReport": { + "type": "object", + "description": "Structured way to declare missing/uncertain items to avoid hallucination. Use JSON Pointers for field locations.", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { + "type": "array", + "description": "JSON Pointers to fields that are truly unavailable in the article.", + "items": { "type": "string", "minLength": 1 }, + "minItems": 0 + }, + "uncertain": { + "type": "array", + "description": "JSON Pointers to fields that are ambiguous or weakly supported.", + "items": { "type": "string", "minLength": 1 }, + "minItems": 0 + }, + "notes": { + "type": ["string", "null"], + "description": "Free-text clarifications, e.g., OCR issues, mapping choices." + } + } + }, + + "iupacBases": { + "type": "string", + "description": "DNA/RNA bases in uppercase IUPAC alphabet: A C G U/T R Y S W K M B D H V N. 
No separators and no ellipsis inside the sequence.", + "pattern": "^[ACGUTRYSWKMBDHVN]+$", + "minLength": 5, + "maxLength": 5000 + }, + + "provenance": { + "type": "object", + "description": "Where a value was obtained in the source document.", + "additionalProperties": false, + "required": ["source_type", "page", "section", "quote", "notes"], + "properties": { + "source_type": { + "type": "string", + "enum": ["pdf", "html", "other", "unknown"], + "description": "Type of source the extractor processed." + }, + "page": { + "type": ["integer", "null"], + "minimum": 1, + "description": "Page number in the source (1-based), if applicable." + }, + "section": { + "type": ["string", "null"], + "description": "Section header or caption in which the value appears." + }, + "quote": { + "type": ["string", "null"], + "description": "Short verbatim snippet that directly supports the value." + }, + "notes": { + "type": ["string", "null"], + "description": "Extractor notes (e.g., OCR artifact, inferred mapping)." + } + } + }, + + "measurement": { + "type": "object", + "description": "Numeric (or quasi-numeric) item holding raw text, optional parsed value/unit, and normalized SI value/unit.", + "additionalProperties": false, + "required": ["raw", "value", "unit", "si_value", "si_unit", "assumptions", "provenance"], + "properties": { + "raw": { + "type": "string", + "minLength": 1, + "maxLength": 200, + "description": "Exact text as written in the article (e.g., '58 °C', '2 mM', '10%')." + }, + "value": { + "type": ["number", "null"], + "description": "Parsed numeric value if present in raw." + }, + "unit": { + "type": ["string", "null"], + "description": "Unit as written in the article (e.g., '°C', 'mM', '%')." + }, + "si_value": { + "type": ["number", "null"], + "description": "Value converted to SI. Examples: temperature in K; concentrations in mol/m^3; fractions 0-1 for percent." 
+ }, + "si_unit": { + "type": ["string", "null"], + "enum": ["K", "mol/m^3", "Pa", "kg/m^3", "s", "dimensionless", null], + "description": "SI unit after conversion." + }, + "assumptions": { + "type": ["string", "null"], + "description": "Conversion assumptions (e.g., density used, ionic strength conventions)." + }, + "provenance": { "$ref": "#/$defs/provenance" } + } + }, + + "decoratedOligo": { + "type": "object", + "description": "An oligonucleotide possibly decorated at 5'/3' with labels (fluorophores/quenchers). Keeps raw string and parsed parts.", + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "labels", "sense_antisense", "provenance"], + "properties": { + "raw": { + "type": "string", + "minLength": 5, + "maxLength": 200, + "description": "Exact oligo string as seen. MUST CONTAIN NUCLEOTIDES, NOT ONLY NAMES. DO NOT COPY THIS SEQUENCE FROM THE EXAMPLE! NEVER USE ELLIPSIS OR SKIP ANY DATA IN YOUR RESPONSE!!!" + }, + "sequence": { + "$ref": "#/$defs/iupacBases", + "description": "Bare base sequence with IUPAC letters only (no labels/hyphens)." + }, + "length_bases": { + "type": ["integer", "null"], + "minimum": 1, + "description": "Base length if given or derivable (e.g., '(27 b)')." + }, + "prime_prefix": { + "type": ["integer", "null"], + "enum": [3, 5, null], + "description": "Leading prime marker if present (3 or 5). Accepts OCR artifacts like 50/5O/5' during parsing." + }, + "five_prime_label": { + "type": ["string", "null"], + "description": "Label at the 5' end if indicated (e.g., FAM, ROX)." + }, + "three_prime_label": { + "type": ["string", "null"], + "description": "Label at the 3' end if indicated (e.g., BHQ1, BHQ2, RTQ1)." 
+ }, + "labels": { + "type": "array", + "description": "All labels found in textual order, including 5' and 3' labels.", + "minItems": 0, + "maxItems": 10, + "items": { "type": "string" } + }, + "sense_antisense": { + "type": ["string", "null"], + "enum": ["sense", "antisense", null], + "description": "If the oligo is explicitly designated as sense (s) or antisense (as) in the article." + }, + "provenance": { "$ref": "#/$defs/provenance" } + } + }, + + "primerPair": { + "type": "object", + "description": "PCR primer pair associated with an amplicon/experiment.", + "additionalProperties": false, + "required": ["forward", "reverse"], + "properties": { + "forward": { + "$ref": "#/$defs/decoratedOligo", + "description": "Forward primer as decorated oligo." + }, + "reverse": { + "$ref": "#/$defs/decoratedOligo", + "description": "Reverse primer as decorated oligo." + } + } + }, + + "probe": { + "type": "object", + "description": "A hybridization probe with name, optional amplicon ID, and decorated oligo details.", + "additionalProperties": false, + "required": ["name", "oligo", "amplicon_id", "fluorophore", "quencher", "sense_antisense", "notes"], + "properties": { + "name": { + "type": "string", + "minLength": 2, + "maxLength": 60, + "description": "Probe name exactly as used (e.g., 'N3-FAM(27)s')." + }, + "amplicon_id": { + "type": ["string", "null"], + "description": "Amplicon tag associated with the probe (e.g., 'K2', 'K3', 'N2', 'N3', 'B15')." + }, + "oligo": { + "$ref": "#/$defs/decoratedOligo", + "description": "The probe's decorated oligo (sequence, labels, direction)." + }, + "fluorophore": { + "type": ["string", "null"], + "description": "Fluorophore name if identifiable; otherwise null." + }, + "quencher": { + "type": ["string", "null"], + "description": "Quencher name if identifiable; otherwise null." 
+ }, + "sense_antisense": { + "type": ["string", "null"], + "enum": ["sense", "antisense", null], + "description": "Sense/antisense designation inferred from probe name suffix (e.g., 's' or 'as')." + }, + "notes": { + "type": ["string", "null"], + "description": "Free-text notes about the probe (ambiguities, special chemistry)." + } + } + } + }, + + "oneOf": [ + { + "title": "Article with experiments/probes", + "type": "object", + "additionalProperties": false, + "required": ["doi", "abstract", "topic", "experiments", "extraction_report"], + "properties": { + "doi": { + "type": "string", + "minLength": 4, + "maxLength": 100, + "description": "Digital Object Identifier for the article." + }, + "abstract": { + "type": "string", + "minLength": 10, + "maxLength": 2000, + "description": "Abstract or summary as extracted." + }, + "topic": { + "type": "string", + "minLength": 2, + "maxLength": 100, + "description": "Short topic/category label (e.g., 'mutation scanning by DMA')." + }, + "experiments": { + "type": "array", + "description": "Each element corresponds to a target-probe pair (plus primers/related sequences) and the full experimental context.", + "minItems": 1, + "maxItems": 2000, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "raw_description", "type", "description", "metadata", "sequences","experiment_properties", "outcome", "pairing", "extraction_report"], + "properties": { + "id_exp": { + "type": "string", + "minLength": 1, + "maxLength": 120, + "description": "Unique experiment identifier (derive if needed from amplicon + probe name, e.g., 'N3-FAM-27-s')." + }, + "raw_description": { + "type": ["string", "null"], + "minLength": 1, + "maxLength": 1000, + "description": "Verbatim or lightly tidied description of the experiment from the article." 
+ }, + "type": { + "type": ["string", "null"], + "minLength": 2, + "maxLength": 120, + "description": "Experiment type (e.g., 'DNA-RNA hybridization', 'real-time PCR', 'DMA')." + }, + "description": { + "type": "string", + "minLength": 10, + "maxLength": 1000, + "description": "Concise human-readable summary of this specific target-probe experiment." + }, + + "metadata": { + "type": "object", + "additionalProperties": false, + "description": "High-level descriptors linked to this experiment.", + "required": ["organism", "technology", "annealing", "pH", "rna_impurities"], + "properties": { + "organism": { + "type": ["string", "null"], + "minLength": 2, + "maxLength": 120, + "description": "Organism (e.g., 'human')." + }, + "technology": { + "type": ["string", "null"], + "minLength": 2, + "maxLength": 120, + "description": "Assay/technology label per article usage (e.g., 'real-time PCR', 'DMA')." + }, + "annealing": { + "type": ["object", "null"], + "additionalProperties": false, + "description": "Annealing process details, with optional quantitative and qualitative components.", + "required": ["quantitative", "qualitative"], + "properties": { + "quantitative": { + "$ref": "#/$defs/measurement", + "description": "Numeric representation (e.g., time or temperature), kept as raw + SI." + }, + "qualitative": { + "type": ["boolean", "null"], + "description": "If the article states a qualitative annealing outcome/criterion." + } + } + }, + "pH": { + "$ref": "#/$defs/measurement", + "description": "pH as raw text with optional parsed numeric; SI stored as dimensionless (same numeric value)." + }, + "rna_impurities": { + "type": ["object", "null"], + "additionalProperties": false, + "description": "RNA impurity information, if discussed.", + "required": ["quantitative", "qualitative"], + "properties": { + "quantitative": { + "$ref": "#/$defs/measurement", + "description": "Quantity/percentage of RNA impurities." 
+ }, + "qualitative": { + "type": ["boolean", "null"], + "description": "Presence/absence or a qualitative statement regarding RNA impurities." + } + } + } + } + }, + + "sequences": { + "type": "object", + "additionalProperties": false, + "description": "All sequences relevant to this target-probe experiment.", + "required": ["target_sequence", "probe", "primer_sequences", "related_sequences"], + "properties": { + "target_sequence": { + "oneOf": [ + { "$ref": "#/$defs/decoratedOligo" }, + { "type": "null" } + ], + "description": "Target genomic sequence if explicitly given; store as decorated oligo only if labels are present; otherwise just sequence and length." + }, + "probe": { + "$ref": "#/$defs/probe", + "description": "The hybridization probe for this experiment." + }, + "primer_sequences": { + "oneOf": [ + { "$ref": "#/$defs/primerPair" }, + { "type": "null" } + ], + "description": "PCR primers associated with this experiment/amplicon if provided." + }, + "related_sequences": { + "type": "array", + "description": "Additional related sequences (controls, references), if any.", + "minItems": 0, + "maxItems": 50, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["related_sequence", "description"], + "properties": { + "related_sequence": { + "$ref": "#/$defs/decoratedOligo", + "description": "A related sequence (plain or decorated)." + }, + "description": { + "type": ["string", "null"], + "minLength": 1, + "maxLength": 200, + "description": "Short explanation of the related sequence's role." 
+ } + } + } + } + } + }, + + "experiment_properties": { + "type": "object", + "additionalProperties": false, + "description": "Quantitative and buffer parameters for this experiment.", + "required": ["concentrations", "parameters_SI"], + "properties": { + "concentrations": { + "type": "object", + "additionalProperties": false, + "description": "Concentration-related values.", + "required": ["dna_rna_concentration", "concentration_SI"], + "properties": { + "dna_rna_concentration": { + "$ref": "#/$defs/measurement", + "description": "Analyte concentration as reported (raw) plus normalized SI (mol/m^3)." + }, + "concentration_SI": { + "$ref": "#/$defs/measurement", + "description": "Optional redundant SI-only concentration if the article already used SI; keep raw text synchronized." + } + } + }, + "parameters_SI": { + "type": "object", + "additionalProperties": false, + "description": "Assay buffer/condition parameters, represented as raw + SI. If any value is not present, fill-in measurements fields as null.", + "required": ["temperature", "Tris", "Na", "K", "Mg", "DMSO"], + "properties": { + "temperature": { + "$ref": "#/$defs/measurement", + "description": "Temperature (e.g., '58 °C'), with SI in Kelvin." + }, + "Tris": { + "$ref": "#/$defs/measurement", + "description": "Tris buffer concentration; SI in mol/m^3 (1 mM = 1 mol/m^3)." + }, + "Na": { + "$ref": "#/$defs/measurement", + "description": "Sodium ion concentration; SI in mol/m^3." + }, + "K": { + "$ref": "#/$defs/measurement", + "description": "Potassium ion concentration; SI in mol/m^3." + }, + "Mg": { + "$ref": "#/$defs/measurement", + "description": "Magnesium ion concentration; SI in mol/m^3." + }, + "DMSO": { + "$ref": "#/$defs/measurement", + "description": "DMSO amount (often % v/v); SI as dimensionless fraction (percent/100)." 
+ } + } + } + } + }, + + "outcome": { + "type": "object", + "additionalProperties": false, + "description": "Results for this target-probe pairing.", + "required": ["outcome", "fluorescence", "comparative_notes"], + "properties": { + "outcome": { + "type": ["boolean", "null"], + "description": "Boolean result if explicitly stated (e.g., success/failure). If not explicit, leave null." + }, + "fluorescence": { + "$ref": "#/$defs/measurement", + "description": "Fluorescence or signal measurement (raw text + normalized form if numeric). If comparative only, keep statement in 'raw' and numeric fields null." + }, + "comparative_notes": { + "type": ["string", "null"], + "minLength": 0, + "maxLength": 500, + "description": "Comparative statements (e.g., 'N3-FAM stronger in real-time PCR; N3-Cy5 stronger in DMA')." + } + } + }, + + "pairing": { + "type": "object", + "additionalProperties": false, + "description": "Optional cross-references to paired/reciprocal probes within the same article.", + "required": ["paired_with_probe_name", "relationship"], + "properties": { + "paired_with_probe_name": { + "type": ["string", "null"], + "description": "Name of the other probe in a reciprocal comparison (e.g., 'N3-Cy5(27)s')." + }, + "relationship": { + "type": ["string", "null"], + "description": "Short label describing the relation (e.g., 'reciprocal comparison', 'same sequence different labels')." + } + } + }, + + "extraction_report": { "$ref": "#/$defs/extractionReport" } + } + } + }, + + "extraction_report": { "$ref": "#/$defs/extractionReport" } + } + }, + + { + "title": "Article with no hybridization probe sequences", + "type": "object", + "additionalProperties": false, + "required": ["doi", "explanation_why_does_not_this_article_have_any_hybridization_probes_sequences", "extraction_report"], + "properties": { + "doi": { + "type": "string", + "minLength": 4, + "maxLength": 100, + "description": "Digital Object Identifier for the article." 
+ }, + "explanation_why_does_not_this_article_have_any_hybridization_probes_sequences": { + "type": "string", + "minLength": 50, + "maxLength": 2000, + "description": "A detailed justification straight from the article explaining the absence of probe sequences." + }, + "extraction_report": { "$ref": "#/$defs/extractionReport" } + } + } + ] +} From fd9843d9a1d0f954545b537f00ee43eb1364a7ba Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Wed, 1 Oct 2025 02:24:19 +0400 Subject: [PATCH 002/102] Qwen does not work great for this task right now --- extraction/config/pipeline.json | 4 ++-- extraction/schemas/{full.json => article.json} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename extraction/schemas/{full.json => article.json} (100%) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 3f4f7ae..27f9975 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -3,11 +3,11 @@ "num_ctx": 131072, "num_predict": 65536, "timeout_s": 1800, - "input_dir": "outputs/text", + "input_dir": "input/txt/", "out_dir": "outlines_output", "full_schema_path": "schema/json/article.json", "db_path": "outlines_output/massive.sqlite", - "article_glob": "input/txt*.txt", + "article_glob": "*.txt", "passes": [ { "name": "A_core", diff --git a/extraction/schemas/full.json b/extraction/schemas/article.json similarity index 100% rename from extraction/schemas/full.json rename to extraction/schemas/article.json From 50845f6ab9592db8940966722f840ecb102ffaa4 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Wed, 1 Oct 2025 02:45:45 +0400 Subject: [PATCH 003/102] Added multiple options support into pipeline --- extraction/config/pipeline.json | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 27f9975..cd6fb8a 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,8 +1,12 @@ { - 
"model_name": "myaniu/qwen2.5-1m:7b", - "num_ctx": 131072, - "num_predict": 65536, - "timeout_s": 1800, + "model_names": ["llama3.1", "myaniu/qwen2.5-1m:7b", "gemma3:27b", "llava:34b", "phi3", "phi4"], + "ollama_parameters":{ + "num_ctx": 65536, + "num_predict": 32768, + "timeout_s": 1800, + "temperature": 0.15, + "seed": 42 + }, "input_dir": "input/txt/", "out_dir": "outlines_output", "full_schema_path": "schema/json/article.json", From 2d238dec32ef70448687e80a51788657036dbdbe Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Wed, 1 Oct 2025 02:57:27 +0400 Subject: [PATCH 004/102] Before launching on all texts --- extraction/config/pipeline.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index cd6fb8a..1d6170b 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -3,10 +3,10 @@ "ollama_parameters":{ "num_ctx": 65536, "num_predict": 32768, - "timeout_s": 1800, "temperature": 0.15, "seed": 42 }, + "timeout_s": 1800, "input_dir": "input/txt/", "out_dir": "outlines_output", "full_schema_path": "schema/json/article.json", From 71b76b8b0404b27abbf3c7ce88520da9501e95db Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Wed, 1 Oct 2025 02:58:20 +0400 Subject: [PATCH 005/102] Launching on massive data --- extraction/config/pipeline.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 1d6170b..8fe852e 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -7,7 +7,7 @@ "seed": 42 }, "timeout_s": 1800, - "input_dir": "input/txt/", + "input_dir": "../outputs/text", "out_dir": "outlines_output", "full_schema_path": "schema/json/article.json", "db_path": "outlines_output/massive.sqlite", From 3c078221bc4531bd99494d0c5ce98b141d309607 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Wed, 1 Oct 2025 03:09:17 +0400 Subject: 
[PATCH 006/102] Reduce timeout --- extraction/config/pipeline.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 8fe852e..e9a0973 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -6,7 +6,7 @@ "temperature": 0.15, "seed": 42 }, - "timeout_s": 1800, + "timeout_s": 300, "input_dir": "../outputs/text", "out_dir": "outlines_output", "full_schema_path": "schema/json/article.json", From 235980d7be2cf147783766f6800f9d2da6344ce0 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Wed, 1 Oct 2025 14:45:26 +0400 Subject: [PATCH 007/102] Will now start with micro-C steps --- extraction/config/pipeline.json | 27 +- extraction/passes/C1_probe_core/prompt.txt | 24 ++ extraction/passes/C1_probe_core/schema.json | 79 +++++ .../passes/C2_target_primers/prompt.txt | 18 ++ .../passes/C2_target_primers/schema.json | 144 +++++++++ extraction/passes/C3_related/prompt.txt | 17 ++ extraction/passes/C3_related/schema.json | 79 +++++ extraction/passes/C_sequences/prompt.txt | 28 ++ extraction/passes/C_sequences/schema.json | 275 +++++++++++++----- 9 files changed, 615 insertions(+), 76 deletions(-) create mode 100644 extraction/passes/C1_probe_core/prompt.txt create mode 100644 extraction/passes/C1_probe_core/schema.json create mode 100644 extraction/passes/C2_target_primers/prompt.txt create mode 100644 extraction/passes/C2_target_primers/schema.json create mode 100644 extraction/passes/C3_related/prompt.txt create mode 100644 extraction/passes/C3_related/schema.json diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index e9a0973..af7b1c9 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,6 +1,13 @@ { - "model_names": ["llama3.1", "myaniu/qwen2.5-1m:7b", "gemma3:27b", "llava:34b", "phi3", "phi4"], - "ollama_parameters":{ + "model_names": [ + "llama3.1", + "myaniu/qwen2.5-1m:7b", + 
"gemma3:27b", + "llava:34b", + "phi3", + "phi4" + ], + "ollama_parameters": { "num_ctx": 65536, "num_predict": 32768, "temperature": 0.15, @@ -24,9 +31,19 @@ "prompt": "passes/B_index/prompt.txt" }, { - "name": "C_sequences", - "schema": "passes/C_sequences/schema.json", - "prompt": "passes/C_sequences/prompt.txt" + "name": "C1_probe_core", + "schema": "passes/C1_probe_core/schema.json", + "prompt": "passes/C1_probe_core/prompt.txt" + }, + { + "name": "C2_target_primers", + "schema": "passes/C2_target_primers/schema.json", + "prompt": "passes/C2_target_primers/prompt.txt" + }, + { + "name": "C3_related", + "schema": "passes/C3_related/schema.json", + "prompt": "passes/C3_related/prompt.txt" }, { "name": "D_parameters", diff --git a/extraction/passes/C1_probe_core/prompt.txt b/extraction/passes/C1_probe_core/prompt.txt new file mode 100644 index 0000000..545d188 --- /dev/null +++ b/extraction/passes/C1_probe_core/prompt.txt @@ -0,0 +1,24 @@ +You are an extraction model. Return ONE JSON object conforming to the JSON Schema (the caller enforces it). 
+ +SCOPE (this pass = PROBES ONLY) +For each experiment id_exp, extract exactly one probe: +- probe.name (as printed) +- probe.amplicon_id if present (e.g., K2, K3, N2, N3, B15), else null +- probe.fluorophore, probe.quencher if present, else null +- probe.sense_antisense: “s” → "sense", “as” → "antisense", else null +- probe.notes: any short clarifications (optional) +- probe.oligo: + - raw: EXACT text (must include at least one nucleotide; no ellipses) + - sequence: IUPAC uppercase only, if present; else null + - length_bases: integer if indicated in text, else null + - prime_prefix: 5 or 3 if leading mark is shown, else null + - five_prime_label / three_prime_label: labels at 5′/3′ ends if shown, else null + - sense_antisense: "sense" / "antisense" if explicit in oligo, else null + - modifications[]: enumerate if present; else empty array + +RULES +- If a field is not present in the article, set it to null (or empty array) and add an entry in extraction_report. +- Do NOT invent values. Do NOT output prose. 
+ +OUTPUT +A single JSON object with: { items: [ { id_exp, probe{…} }, … ], extraction_report } diff --git a/extraction/passes/C1_probe_core/schema.json b/extraction/passes/C1_probe_core/schema.json new file mode 100644 index 0000000..d0e0c30 --- /dev/null +++ b/extraction/passes/C1_probe_core/schema.json @@ -0,0 +1,79 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "ProbeCorePerExperiment", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "probe"], + "properties": { + "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + + "probe": { + "type": "object", + "additionalProperties": false, + "required": ["name", "amplicon_id", "fluorophore", "quencher", "sense_antisense", "notes", "oligo"], + "properties": { + "name": { "type": "string", "minLength": 2, "maxLength": 200 }, + "amplicon_id": { "type": ["string", "null"], "maxLength": 40 }, + "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, + "quencher": { "type": ["string", "null"], "maxLength": 60 }, + "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "notes": { "type": ["string", "null"], "maxLength": 400 }, + + "oligo": { + "type": "object", + "additionalProperties": false, + "required": [ + "raw", "sequence", "length_bases", + "prime_prefix", "five_prime_label", "three_prime_label", + "sense_antisense", "modifications" + ], + "properties": { + "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, + "sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, "maxLength": 5000 }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "type": ["integer", "null"], "description": "3 or 5 when indicated." 
}, + "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type"], + "properties": { + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 120 }, + "description": { "type": ["string", "null"], "maxLength": 200 } + } + } + } + } + } + } + } + } + } + }, + + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "notes": { "type": ["string", "null"] } + } + } + } +} diff --git a/extraction/passes/C2_target_primers/prompt.txt b/extraction/passes/C2_target_primers/prompt.txt new file mode 100644 index 0000000..8fcf225 --- /dev/null +++ b/extraction/passes/C2_target_primers/prompt.txt @@ -0,0 +1,18 @@ +You are an extraction model. Return ONE JSON object conforming to the JSON Schema (the caller enforces it). + +SCOPE (this pass = TARGET + PRIMERS) +For each experiment id_exp: +- target_sequence: same oligo decomposition fields as probes. If no explicit target oligo is printed, set to null. +- primer_sequences: object with forward and reverse oligos, each decomposed like probes (raw, sequence, labels, modifications...). If primers are not listed, set primer_sequences to null. + +IMPORTANT RULES +- oligo.raw is copied EXACTLY and must contain ≥1 nucleotide letter. No ellipses. +- sequence must be IUPAC uppercase: A C G U/T R Y S W K M B D H V N (no spaces/punct.). 
+- prime_prefix 5/3 only if explicitly shown; otherwise null. +- five_prime_label / three_prime_label if present; otherwise null. +- fluorophore / quencher usually null for primers, but set if printed. +- modifications[] empty when absent. +- If any field is not present, set to null and record in extraction_report; do not guess. + +OUTPUT +A single JSON object with: { items: [ { id_exp, target_sequence, primer_sequences }, … ], extraction_report } diff --git a/extraction/passes/C2_target_primers/schema.json b/extraction/passes/C2_target_primers/schema.json new file mode 100644 index 0000000..f417b99 --- /dev/null +++ b/extraction/passes/C2_target_primers/schema.json @@ -0,0 +1,144 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "TargetAndPrimersPerExperiment", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "target_sequence", "primer_sequences"], + "properties": { + "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + + "target_sequence": { + "type": ["object", "null"], + "additionalProperties": false, + "required": [ + "raw", "sequence", "length_bases", + "prime_prefix", "five_prime_label", "three_prime_label", + "sense_antisense", "modifications", "fluorophore", "quencher" + ], + "properties": { + "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, + "sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, "maxLength": 5000 }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "type": ["integer", "null"] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "sense_antisense": { "type": ["string", "null"], "enum": 
["sense", "antisense", null] }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type"], + "properties": { + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 120 }, + "description": { "type": ["string", "null"], "maxLength": 200 } + } + } + }, + "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, + "quencher": { "type": ["string", "null"], "maxLength": 60 } + } + }, + + "primer_sequences": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["forward", "reverse"], + "properties": { + "forward": { + "type": ["object", "null"], + "additionalProperties": false, + "required": [ + "raw", "sequence", "length_bases", + "prime_prefix", "five_prime_label", "three_prime_label", + "sense_antisense", "modifications", "fluorophore", "quencher" + ], + "properties": { + "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, + "sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, "maxLength": 5000 }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "type": ["integer", "null"] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type"], + "properties": { + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 120 }, + "description": { "type": ["string", "null"], "maxLength": 200 } + } 
+ } + }, + "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, + "quencher": { "type": ["string", "null"], "maxLength": 60 } + } + }, + + "reverse": { + "type": ["object", "null"], + "additionalProperties": false, + "required": [ + "raw", "sequence", "length_bases", + "prime_prefix", "five_prime_label", "three_prime_label", + "sense_antisense", "modifications", "fluorophore", "quencher" + ], + "properties": { + "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, + "sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, "maxLength": 5000 }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "type": ["integer", "null"] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type"], + "properties": { + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 120 }, + "description": { "type": ["string", "null"], "maxLength": 200 } + } + } + }, + "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, + "quencher": { "type": ["string", "null"], "maxLength": 60 } + } + } + } + } + } + } + }, + + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "notes": { "type": ["string", "null"] } + } + } + } +} diff --git a/extraction/passes/C3_related/prompt.txt 
b/extraction/passes/C3_related/prompt.txt new file mode 100644 index 0000000..c506546 --- /dev/null +++ b/extraction/passes/C3_related/prompt.txt @@ -0,0 +1,17 @@ +You are an extraction model. Return ONE JSON object conforming to the JSON Schema (the caller enforces it). + +SCOPE (this pass = RELATED SEQUENCES) +For each experiment id_exp, extract zero or more related sequences: +- related_sequences[]: each item has related_sequence (oligo decomposition like probes) and optional description. + +RULES +- oligo.raw copied EXACTLY; must include ≥1 nucleotide letter; no ellipses. +- sequence = IUPAC uppercase only (no spaces/punct.), else null. +- prime_prefix = 5 or 3 if shown, else null. +- five_prime_label / three_prime_label if printed, else null. +- fluorophore / quencher if printed, else null. +- modifications[] empty if absent. +- If not provided in the article, use an empty array. Do NOT invent sequences. + +OUTPUT +A single JSON object with: { items: [ { id_exp, related_sequences[] }, … ], extraction_report } diff --git a/extraction/passes/C3_related/schema.json b/extraction/passes/C3_related/schema.json new file mode 100644 index 0000000..83e13de --- /dev/null +++ b/extraction/passes/C3_related/schema.json @@ -0,0 +1,79 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "RelatedSequencesPerExperiment", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "related_sequences"], + "properties": { + "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + + "related_sequences": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["related_sequence", "description"], + "properties": { + "related_sequence": { + "type": "object", + "additionalProperties": false, + 
"required": [ + "raw", "sequence", "length_bases", + "prime_prefix", "five_prime_label", "three_prime_label", + "sense_antisense", "modifications", "fluorophore", "quencher" + ], + "properties": { + "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, + "sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, "maxLength": 5000 }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "type": ["integer", "null"] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type"], + "properties": { + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 120 }, + "description": { "type": ["string", "null"], "maxLength": 200 } + } + } + }, + "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, + "quencher": { "type": ["string", "null"], "maxLength": 60 } + } + }, + "description": { "type": ["string", "null"], "maxLength": 200 } + } + } + } + } + } + }, + + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "notes": { "type": ["string", "null"] } + } + } + } +} diff --git a/extraction/passes/C_sequences/prompt.txt b/extraction/passes/C_sequences/prompt.txt index 0984499..3161765 100644 --- a/extraction/passes/C_sequences/prompt.txt +++ b/extraction/passes/C_sequences/prompt.txt 
@@ -14,3 +14,31 @@ Perform the following tasks: * For each `id_exp`, extract probe (name and the full oligo string exactly as printed in the article text), and include optional `target_sequence`, `primer_sequences`, and `related_sequences` when present, otherwise set them to `null`. * The `oligo_lite.raw` must contain nucleotides and no ellipses. * Keep labels like FAM/ROX/BHQ2 in the text; if article does not mention them explicitly, leave derived fields `null`. + +You are an extraction model. Return ONE JSON object that conforms to the JSON Schema (the caller enforces it). + +STRICT RULES +- Emit every key defined by the schema. If a value is not explicitly present in the article, set it to null. Do NOT invent. +- Copy oligo.raw exactly as printed in the article (no ellipses, no placeholders). oligo.raw MUST include at least one nucleotide letter. +- sequence must be IUPAC uppercase only: A C G U/T R Y S W K M B D H V N (no spaces, no punctuation). +- prime_prefix is 5 or 3 when the prefix like “5′-” or “3′-” is present, else null. +- five_prime_label / three_prime_label: labels at 5′/3′ ends (e.g., FAM, ROX, BHQ1, BHQ2, RTQ1), else null. +- fluorophore / quencher: extract if present, else null. +- sense_antisense: map explicit mentions (e.g., “(27)s” -> "sense", “(27)as” -> "antisense"), else null. +- modifications[]: if any modified bases or special chemistry is specified, enumerate entries; else empty array. +- related_sequences: array (possibly empty). primers: object with forward/reverse (or null if not provided). 
+ +TASK +From the article text, produce per-experiment items with: +- id_exp +- probe { name, amplicon_id?, fluorophore?, quencher?, sense_antisense?, notes?, oligo{raw, sequence?, length_bases?, prime_prefix?, five_prime_label?, three_prime_label?, sense_antisense?, modifications[]}} +- target_sequence (same oligo decomposition) or null +- primer_sequences {forward oligo, reverse oligo} or null +- related_sequences[] {related_sequence oligo, description?} + +EXTRACTION REPORT +- Put any truly unavailable or ambiguous fields in extraction_report.missing / extraction_report.uncertain. +- Do NOT hallucinate. Prefer null + report over guesses. + +OUTPUT +Return exactly one JSON object that conforms to the schema. No prose. diff --git a/extraction/passes/C_sequences/schema.json b/extraction/passes/C_sequences/schema.json index 4d3c36f..56f7cc4 100644 --- a/extraction/passes/C_sequences/schema.json +++ b/extraction/passes/C_sequences/schema.json @@ -14,31 +14,220 @@ "required": ["id_exp", "probe", "target_sequence", "primer_sequences", "related_sequences"], "properties": { "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, - "target_sequence": { "oneOf": [ { "$ref": "#/$defs/oligo_lite" }, { "type": "null" } ] }, - "probe": { "$ref": "#/$defs/probe_lite" }, - "primer_sequences": { - "oneOf": [ - { + + "probe": { + "type": "object", + "additionalProperties": false, + "required": ["name", "amplicon_id", "fluorophore", "quencher", "sense_antisense", "notes", "oligo"], + "properties": { + "name": { "type": "string", "minLength": 2, "maxLength": 200 }, + "amplicon_id": { "type": ["string", "null"], "maxLength": 40 }, + "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, + "quencher": { "type": ["string", "null"], "maxLength": 60 }, + "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "notes": { "type": ["string", "null"], "maxLength": 400 }, + + "oligo": { "type": "object", "additionalProperties": false, - 
"required": ["forward", "reverse"], + "required": [ + "raw", "sequence", "length_bases", + "prime_prefix", "five_prime_label", "three_prime_label", + "sense_antisense", "modifications" + ], "properties": { - "forward": { "$ref": "#/$defs/oligo_lite" }, - "reverse": { "$ref": "#/$defs/oligo_lite" } + "raw": { + "type": "string", + "minLength": 5, + "maxLength": 200, + "pattern": ".*[ACGTIU].*", + "description": "Exact as printed; must contain at least one nucleotide; no ellipses." + }, + "sequence": { + "type": ["string", "null"], + "pattern": "^[ACGUTRYSWKMBDHVN]+$", + "minLength": 5, + "maxLength": 5000, + "description": "IUPAC uppercase only; null if not stated." + }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "type": ["integer", "null"], "description": "3 or 5 when present." }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type"], + "properties": { + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 120 }, + "description": { "type": ["string", "null"], "maxLength": 200 } + } + } + } + } + } + } + }, + + "target_sequence": { + "type": ["object", "null"], + "additionalProperties": false, + "required": [ + "raw", "sequence", "length_bases", + "prime_prefix", "five_prime_label", "three_prime_label", + "sense_antisense", "modifications", "fluorophore", "quencher" + ], + "properties": { + "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, + "sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, 
"maxLength": 5000 }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "type": ["integer", "null"] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type"], + "properties": { + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 120 }, + "description": { "type": ["string", "null"], "maxLength": 200 } + } } }, - { "type": "null" } - ] + "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, + "quencher": { "type": ["string", "null"], "maxLength": 60 } + }, + "description": "Target oligo if given; keep same structure for uniformity; fluor/quencher usually null." 
}, + + "primer_sequences": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["forward", "reverse"], + "properties": { + "forward": { + "type": ["object", "null"], + "additionalProperties": false, + "required": [ + "raw", "sequence", "length_bases", + "prime_prefix", "five_prime_label", "three_prime_label", + "sense_antisense", "modifications", "fluorophore", "quencher" + ], + "properties": { + "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, + "sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, "maxLength": 5000 }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "type": ["integer", "null"] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type"], + "properties": { + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 120 }, + "description": { "type": ["string", "null"], "maxLength": 200 } + } + } + }, + "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, + "quencher": { "type": ["string", "null"], "maxLength": 60 } + } + }, + "reverse": { + "type": ["object", "null"], + "additionalProperties": false, + "required": [ + "raw", "sequence", "length_bases", + "prime_prefix", "five_prime_label", "three_prime_label", + "sense_antisense", "modifications", "fluorophore", "quencher" + ], + "properties": { + "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, + "sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", 
"minLength": 5, "maxLength": 5000 }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "type": ["integer", "null"] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type"], + "properties": { + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 120 }, + "description": { "type": ["string", "null"], "maxLength": 200 } + } + } + }, + "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, + "quencher": { "type": ["string", "null"], "maxLength": 60 } + } + } + } + }, + "related_sequences": { "type": "array", "minItems": 0, "items": { "type": "object", "additionalProperties": false, - "required": ["related_sequence"], + "required": ["related_sequence", "description"], "properties": { - "related_sequence": { "$ref": "#/$defs/oligo_lite" }, + "related_sequence": { + "type": "object", + "additionalProperties": false, + "required": [ + "raw", "sequence", "length_bases", + "prime_prefix", "five_prime_label", "three_prime_label", + "sense_antisense", "modifications", "fluorophore", "quencher" + ], + "properties": { + "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, + "sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, "maxLength": 5000 }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "type": ["integer", "null"] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, + 
"sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type"], + "properties": { + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 120 }, + "description": { "type": ["string", "null"], "maxLength": 200 } + } + } + }, + "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, + "quencher": { "type": ["string", "null"], "maxLength": 60 } + } + }, "description": { "type": ["string", "null"], "maxLength": 200 } } } @@ -46,72 +235,16 @@ } } }, + "extraction_report": { "type": "object", "additionalProperties": false, "required": ["missing", "uncertain", "notes"], "properties": { - "missing": { "type": "array", "items": { "type": "string" } }, - "uncertain": { "type": "array", "items": { "type": "string" } }, + "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, "notes": { "type": ["string", "null"] } } } - }, - "$defs": { - "iupacBases": { - "type": "string", - "description": "DNA/RNA bases in uppercase IUPAC alphabet: A C G U/T R Y S W K M B D H V N. 
No separators and no ellipsis inside the sequence.", - "pattern": "^[ACGUTRYSWKMBDHVN]+$", - "minLength": 5, - "maxLength": 5000 - }, - "measurement_lite": { - "type": "object", - "additionalProperties": false, - "required": ["raw", "value", "unit"], - "properties": { - "raw": { "type": "string", "minLength": 1, "maxLength": 200 }, - "value": { "type": ["number", "null"] }, - "unit": { "type": ["string", "null"], "maxLength": 50 } - } - }, - "oligo_lite": { - "type": "object", - "additionalProperties": false, - "required": ["raw", "sequence", "length_bases", "five_prime_label", "three_prime_label", "sense_antisense"], - "properties": { - "raw": { - "type": "string", - "minLength": 5, - "maxLength": 200, - "pattern": "^[A-Za-z0-9\\-\\(\\)\\[\\]'\"/\\+\\sµ′’]*[ACGTIU][A-Za-z0-9\\-\\(\\)\\[\\]'\"/\\+\\sµ′’]*$", - "description": "Keep exactly as printed; must include at least one nucleotide; no ellipses." - }, - "sequence": { - "$ref": "#/$defs/iupacBases" - }, - "length_bases": { "type": ["integer", "null"], "minimum": 1 }, - "five_prime_label": { "type": ["string", "null"], "maxLength": 30 }, - "three_prime_label": { "type": ["string", "null"], "maxLength": 30 }, - "sense_antisense": { - "type": ["string", "null"], - "enum": ["sense", "antisense", null] - } - } - }, - "probe_lite": { - "type": "object", - "additionalProperties": false, - "required": ["name", "oligo", "fluorophore", "quencher", "sense_antisense", "amplicon_id", "notes"], - "properties": { - "name": { "type": "string", "minLength": 2, "maxLength": 200 }, - "amplicon_id": { "type": ["string", "null"], "maxLength": 40 }, - "oligo": { "$ref": "#/$defs/oligo_lite" }, - "fluorophore": { "type": ["string", "null"], "maxLength": 40 }, - "quencher": { "type": ["string", "null"], "maxLength": 40 }, - "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, - "notes": { "type": ["string", "null"], "maxLength": 400 } - } - } } } From d9fa7d1c3e1b03c56ce380bf279167b578668c96 Mon Sep 
17 00:00:00 2001 From: Alexander Serdyukov Date: Wed, 1 Oct 2025 14:48:59 +0400 Subject: [PATCH 008/102] Add all schemas --- extraction/config/pipeline.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index af7b1c9..39ca469 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -45,6 +45,11 @@ "schema": "passes/C3_related/schema.json", "prompt": "passes/C3_related/prompt.txt" }, + { + "name": "C_sequences", + "schema": "passes/C_sequences/schema.json", + "prompt": "passes/C_sequences/prompt.txt" + }, { "name": "D_parameters", "schema": "passes/D_parameters/schema.json", From 0866a2ec9f9e51882fe8ef5ee18aadaf7e4aea41 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Wed, 1 Oct 2025 23:05:48 +0400 Subject: [PATCH 009/102] Update schema to try pass Ollama limitations --- extraction/passes/C1_probe_core/schema.json | 55 ++--- .../passes/C2_target_primers/schema.json | 111 ++++------ extraction/passes/C3_related/schema.json | 49 ++--- extraction/passes/C_sequences/schema.json | 203 +++++++----------- 4 files changed, 171 insertions(+), 247 deletions(-) diff --git a/extraction/passes/C1_probe_core/schema.json b/extraction/passes/C1_probe_core/schema.json index d0e0c30..638c788 100644 --- a/extraction/passes/C1_probe_core/schema.json +++ b/extraction/passes/C1_probe_core/schema.json @@ -1,59 +1,51 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "ProbeCorePerExperiment", + "title": "ProbeCorePerExperiment (generation schema)", "type": "object", "additionalProperties": false, "required": ["items", "extraction_report"], "properties": { "items": { "type": "array", - "minItems": 1, "items": { "type": "object", "additionalProperties": false, "required": ["id_exp", "probe"], "properties": { - "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, - + "id_exp": { "type": "string" }, "probe": { "type": "object", 
"additionalProperties": false, "required": ["name", "amplicon_id", "fluorophore", "quencher", "sense_antisense", "notes", "oligo"], "properties": { - "name": { "type": "string", "minLength": 2, "maxLength": 200 }, - "amplicon_id": { "type": ["string", "null"], "maxLength": 40 }, - "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, - "quencher": { "type": ["string", "null"], "maxLength": 60 }, - "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, - "notes": { "type": ["string", "null"], "maxLength": 400 }, - + "name": { }, + "amplicon_id": { }, + "fluorophore": { }, + "quencher": { }, + "sense_antisense": { }, + "notes": { }, "oligo": { "type": "object", "additionalProperties": false, - "required": [ - "raw", "sequence", "length_bases", - "prime_prefix", "five_prime_label", "three_prime_label", - "sense_antisense", "modifications" - ], + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications"], "properties": { - "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, - "sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, "maxLength": 5000 }, - "length_bases": { "type": ["integer", "null"], "minimum": 1 }, - "prime_prefix": { "type": ["integer", "null"], "description": "3 or 5 when indicated." 
}, - "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "raw": { }, + "sequence": { }, + "length_bases": { }, + "prime_prefix": { }, + "five_prime_label": { }, + "three_prime_label": { }, + "sense_antisense": { }, "modifications": { "type": "array", - "minItems": 0, "items": { "type": "object", "additionalProperties": false, - "required": ["modification_position", "modification_type"], + "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { "type": ["integer", "null"], "minimum": 1 }, - "modification_type": { "type": ["string", "null"], "maxLength": 120 }, - "description": { "type": ["string", "null"], "maxLength": 200 } + "modification_position": { }, + "modification_type": { }, + "description": { } } } } @@ -64,15 +56,14 @@ } } }, - "extraction_report": { "type": "object", "additionalProperties": false, "required": ["missing", "uncertain", "notes"], "properties": { - "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, - "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, - "notes": { "type": ["string", "null"] } + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { } } } } diff --git a/extraction/passes/C2_target_primers/schema.json b/extraction/passes/C2_target_primers/schema.json index f417b99..4301eb2 100644 --- a/extraction/passes/C2_target_primers/schema.json +++ b/extraction/passes/C2_target_primers/schema.json @@ -1,52 +1,46 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "TargetAndPrimersPerExperiment", + "title": "TargetAndPrimersPerExperiment (generation schema)", "type": "object", "additionalProperties": false, "required": ["items", 
"extraction_report"], "properties": { "items": { "type": "array", - "minItems": 1, "items": { "type": "object", "additionalProperties": false, "required": ["id_exp", "target_sequence", "primer_sequences"], "properties": { - "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + "id_exp": { "type": "string" }, "target_sequence": { "type": ["object", "null"], "additionalProperties": false, - "required": [ - "raw", "sequence", "length_bases", - "prime_prefix", "five_prime_label", "three_prime_label", - "sense_antisense", "modifications", "fluorophore", "quencher" - ], + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { - "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, - "sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, "maxLength": 5000 }, - "length_bases": { "type": ["integer", "null"], "minimum": 1 }, - "prime_prefix": { "type": ["integer", "null"] }, - "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "raw": { }, + "sequence": { }, + "length_bases": { }, + "prime_prefix": { }, + "five_prime_label": { }, + "three_prime_label": { }, + "sense_antisense": { }, "modifications": { "type": "array", - "minItems": 0, "items": { "type": "object", "additionalProperties": false, - "required": ["modification_position", "modification_type"], + "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { "type": ["integer", "null"], "minimum": 1 }, - "modification_type": { "type": ["string", "null"], "maxLength": 120 }, - "description": { "type": ["string", "null"], "maxLength": 200 } + "modification_position": 
{ }, + "modification_type": { }, + "description": { } } } }, - "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, - "quencher": { "type": ["string", "null"], "maxLength": 60 } + "fluorophore": { }, + "quencher": { } } }, @@ -58,70 +52,59 @@ "forward": { "type": ["object", "null"], "additionalProperties": false, - "required": [ - "raw", "sequence", "length_bases", - "prime_prefix", "five_prime_label", "three_prime_label", - "sense_antisense", "modifications", "fluorophore", "quencher" - ], + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { - "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, - "sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, "maxLength": 5000 }, - "length_bases": { "type": ["integer", "null"], "minimum": 1 }, - "prime_prefix": { "type": ["integer", "null"] }, - "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "raw": { }, + "sequence": { }, + "length_bases": { }, + "prime_prefix": { }, + "five_prime_label": { }, + "three_prime_label": { }, + "sense_antisense": { }, "modifications": { "type": "array", - "minItems": 0, "items": { "type": "object", "additionalProperties": false, - "required": ["modification_position", "modification_type"], + "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { "type": ["integer", "null"], "minimum": 1 }, - "modification_type": { "type": ["string", "null"], "maxLength": 120 }, - "description": { "type": ["string", "null"], "maxLength": 200 } + "modification_position": { }, + "modification_type": { }, + "description": { } } } }, - "fluorophore": 
{ "type": ["string", "null"], "maxLength": 60 }, - "quencher": { "type": ["string", "null"], "maxLength": 60 } + "fluorophore": { }, + "quencher": { } } }, - "reverse": { "type": ["object", "null"], "additionalProperties": false, - "required": [ - "raw", "sequence", "length_bases", - "prime_prefix", "five_prime_label", "three_prime_label", - "sense_antisense", "modifications", "fluorophore", "quencher" - ], + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { - "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, - "sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, "maxLength": 5000 }, - "length_bases": { "type": ["integer", "null"], "minimum": 1 }, - "prime_prefix": { "type": ["integer", "null"] }, - "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "raw": { }, + "sequence": { }, + "length_bases": { }, + "prime_prefix": { }, + "five_prime_label": { }, + "three_prime_label": { }, + "sense_antisense": { }, "modifications": { "type": "array", - "minItems": 0, "items": { "type": "object", "additionalProperties": false, - "required": ["modification_position", "modification_type"], + "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { "type": ["integer", "null"], "minimum": 1 }, - "modification_type": { "type": ["string", "null"], "maxLength": 120 }, - "description": { "type": ["string", "null"], "maxLength": 200 } + "modification_position": { }, + "modification_type": { }, + "description": { } } } }, - "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, - "quencher": { "type": ["string", "null"], 
"maxLength": 60 } + "fluorophore": { }, + "quencher": { } } } } @@ -135,9 +118,9 @@ "additionalProperties": false, "required": ["missing", "uncertain", "notes"], "properties": { - "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, - "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, - "notes": { "type": ["string", "null"] } + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { } } } } diff --git a/extraction/passes/C3_related/schema.json b/extraction/passes/C3_related/schema.json index 83e13de..7cb7016 100644 --- a/extraction/passes/C3_related/schema.json +++ b/extraction/passes/C3_related/schema.json @@ -1,23 +1,20 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "RelatedSequencesPerExperiment", + "title": "RelatedSequencesPerExperiment (generation schema)", "type": "object", "additionalProperties": false, "required": ["items", "extraction_report"], "properties": { "items": { "type": "array", - "minItems": 1, "items": { "type": "object", "additionalProperties": false, "required": ["id_exp", "related_sequences"], "properties": { - "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, - + "id_exp": { "type": "string" }, "related_sequences": { "type": "array", - "minItems": 0, "items": { "type": "object", "additionalProperties": false, @@ -26,53 +23,47 @@ "related_sequence": { "type": "object", "additionalProperties": false, - "required": [ - "raw", "sequence", "length_bases", - "prime_prefix", "five_prime_label", "three_prime_label", - "sense_antisense", "modifications", "fluorophore", "quencher" - ], + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { - "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, - 
"sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, "maxLength": 5000 }, - "length_bases": { "type": ["integer", "null"], "minimum": 1 }, - "prime_prefix": { "type": ["integer", "null"] }, - "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "raw": { }, + "sequence": { }, + "length_bases": { }, + "prime_prefix": { }, + "five_prime_label": { }, + "three_prime_label": { }, + "sense_antisense": { }, "modifications": { "type": "array", - "minItems": 0, "items": { "type": "object", "additionalProperties": false, - "required": ["modification_position", "modification_type"], + "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { "type": ["integer", "null"], "minimum": 1 }, - "modification_type": { "type": ["string", "null"], "maxLength": 120 }, - "description": { "type": ["string", "null"], "maxLength": 200 } + "modification_position": { }, + "modification_type": { }, + "description": { } } } }, - "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, - "quencher": { "type": ["string", "null"], "maxLength": 60 } + "fluorophore": { }, + "quencher": { } } }, - "description": { "type": ["string", "null"], "maxLength": 200 } + "description": { } } } } } } }, - "extraction_report": { "type": "object", "additionalProperties": false, "required": ["missing", "uncertain", "notes"], "properties": { - "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, - "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, - "notes": { "type": ["string", "null"] } + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { } } } } diff --git 
a/extraction/passes/C_sequences/schema.json b/extraction/passes/C_sequences/schema.json index 56f7cc4..199d500 100644 --- a/extraction/passes/C_sequences/schema.json +++ b/extraction/passes/C_sequences/schema.json @@ -1,71 +1,52 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "SequencesPerExperiment", + "title": "SequencesPerExperiment (generation schema)", "type": "object", "additionalProperties": false, "required": ["items", "extraction_report"], "properties": { "items": { "type": "array", - "minItems": 1, "items": { "type": "object", "additionalProperties": false, "required": ["id_exp", "probe", "target_sequence", "primer_sequences", "related_sequences"], "properties": { - "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + "id_exp": { "type": "string" }, "probe": { "type": "object", "additionalProperties": false, "required": ["name", "amplicon_id", "fluorophore", "quencher", "sense_antisense", "notes", "oligo"], "properties": { - "name": { "type": "string", "minLength": 2, "maxLength": 200 }, - "amplicon_id": { "type": ["string", "null"], "maxLength": 40 }, - "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, - "quencher": { "type": ["string", "null"], "maxLength": 60 }, - "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, - "notes": { "type": ["string", "null"], "maxLength": 400 }, - + "name": { }, + "amplicon_id": { }, + "fluorophore": { }, + "quencher": { }, + "sense_antisense": { }, + "notes": { }, "oligo": { "type": "object", "additionalProperties": false, - "required": [ - "raw", "sequence", "length_bases", - "prime_prefix", "five_prime_label", "three_prime_label", - "sense_antisense", "modifications" - ], + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications"], "properties": { - "raw": { - "type": "string", - "minLength": 5, - "maxLength": 200, - "pattern": ".*[ACGTIU].*", - 
"description": "Exact as printed; must contain at least one nucleotide; no ellipses." - }, - "sequence": { - "type": ["string", "null"], - "pattern": "^[ACGUTRYSWKMBDHVN]+$", - "minLength": 5, - "maxLength": 5000, - "description": "IUPAC uppercase only; null if not stated." - }, - "length_bases": { "type": ["integer", "null"], "minimum": 1 }, - "prime_prefix": { "type": ["integer", "null"], "description": "3 or 5 when present." }, - "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "raw": { }, + "sequence": { }, + "length_bases": { }, + "prime_prefix": { }, + "five_prime_label": { }, + "three_prime_label": { }, + "sense_antisense": { }, "modifications": { "type": "array", - "minItems": 0, "items": { "type": "object", "additionalProperties": false, - "required": ["modification_position", "modification_type"], + "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { "type": ["integer", "null"], "minimum": 1 }, - "modification_type": { "type": ["string", "null"], "maxLength": 120 }, - "description": { "type": ["string", "null"], "maxLength": 200 } + "modification_position": { }, + "modification_type": { }, + "description": { } } } } @@ -77,37 +58,31 @@ "target_sequence": { "type": ["object", "null"], "additionalProperties": false, - "required": [ - "raw", "sequence", "length_bases", - "prime_prefix", "five_prime_label", "three_prime_label", - "sense_antisense", "modifications", "fluorophore", "quencher" - ], + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { - "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, - "sequence": { "type": ["string", 
"null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, "maxLength": 5000 }, - "length_bases": { "type": ["integer", "null"], "minimum": 1 }, - "prime_prefix": { "type": ["integer", "null"] }, - "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "raw": { }, + "sequence": { }, + "length_bases": { }, + "prime_prefix": { }, + "five_prime_label": { }, + "three_prime_label": { }, + "sense_antisense": { }, "modifications": { "type": "array", - "minItems": 0, "items": { "type": "object", "additionalProperties": false, - "required": ["modification_position", "modification_type"], + "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { "type": ["integer", "null"], "minimum": 1 }, - "modification_type": { "type": ["string", "null"], "maxLength": 120 }, - "description": { "type": ["string", "null"], "maxLength": 200 } + "modification_position": { }, + "modification_type": { }, + "description": { } } } }, - "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, - "quencher": { "type": ["string", "null"], "maxLength": 60 } - }, - "description": "Target oligo if given; keep same structure for uniformity; fluor/quencher usually null." 
+ "fluorophore": { }, + "quencher": { } + } }, "primer_sequences": { @@ -118,69 +93,59 @@ "forward": { "type": ["object", "null"], "additionalProperties": false, - "required": [ - "raw", "sequence", "length_bases", - "prime_prefix", "five_prime_label", "three_prime_label", - "sense_antisense", "modifications", "fluorophore", "quencher" - ], + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { - "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, - "sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, "maxLength": 5000 }, - "length_bases": { "type": ["integer", "null"], "minimum": 1 }, - "prime_prefix": { "type": ["integer", "null"] }, - "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "raw": { }, + "sequence": { }, + "length_bases": { }, + "prime_prefix": { }, + "five_prime_label": { }, + "three_prime_label": { }, + "sense_antisense": { }, "modifications": { "type": "array", - "minItems": 0, "items": { "type": "object", "additionalProperties": false, - "required": ["modification_position", "modification_type"], + "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { "type": ["integer", "null"], "minimum": 1 }, - "modification_type": { "type": ["string", "null"], "maxLength": 120 }, - "description": { "type": ["string", "null"], "maxLength": 200 } + "modification_position": { }, + "modification_type": { }, + "description": { } } } }, - "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, - "quencher": { "type": ["string", "null"], "maxLength": 60 } + "fluorophore": { }, + "quencher": { } } }, 
"reverse": { "type": ["object", "null"], "additionalProperties": false, - "required": [ - "raw", "sequence", "length_bases", - "prime_prefix", "five_prime_label", "three_prime_label", - "sense_antisense", "modifications", "fluorophore", "quencher" - ], + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { - "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, - "sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, "maxLength": 5000 }, - "length_bases": { "type": ["integer", "null"], "minimum": 1 }, - "prime_prefix": { "type": ["integer", "null"] }, - "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "raw": { }, + "sequence": { }, + "length_bases": { }, + "prime_prefix": { }, + "five_prime_label": { }, + "three_prime_label": { }, + "sense_antisense": { }, "modifications": { "type": "array", - "minItems": 0, "items": { "type": "object", "additionalProperties": false, - "required": ["modification_position", "modification_type"], + "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { "type": ["integer", "null"], "minimum": 1 }, - "modification_type": { "type": ["string", "null"], "maxLength": 120 }, - "description": { "type": ["string", "null"], "maxLength": 200 } + "modification_position": { }, + "modification_type": { }, + "description": { } } } }, - "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, - "quencher": { "type": ["string", "null"], "maxLength": 60 } + "fluorophore": { }, + "quencher": { } } } } @@ -188,7 +153,6 @@ "related_sequences": { "type": "array", - "minItems": 0, "items": { "type": 
"object", "additionalProperties": false, @@ -197,38 +161,33 @@ "related_sequence": { "type": "object", "additionalProperties": false, - "required": [ - "raw", "sequence", "length_bases", - "prime_prefix", "five_prime_label", "three_prime_label", - "sense_antisense", "modifications", "fluorophore", "quencher" - ], + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { - "raw": { "type": "string", "minLength": 5, "maxLength": 200, "pattern": ".*[ACGTIU].*" }, - "sequence": { "type": ["string", "null"], "pattern": "^[ACGUTRYSWKMBDHVN]+$", "minLength": 5, "maxLength": 5000 }, - "length_bases": { "type": ["integer", "null"], "minimum": 1 }, - "prime_prefix": { "type": ["integer", "null"] }, - "five_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "three_prime_label": { "type": ["string", "null"], "maxLength": 60 }, - "sense_antisense": { "type": ["string", "null"], "enum": ["sense", "antisense", null] }, + "raw": { }, + "sequence": { }, + "length_bases": { }, + "prime_prefix": { }, + "five_prime_label": { }, + "three_prime_label": { }, + "sense_antisense": { }, "modifications": { "type": "array", - "minItems": 0, "items": { "type": "object", "additionalProperties": false, - "required": ["modification_position", "modification_type"], + "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { "type": ["integer", "null"], "minimum": 1 }, - "modification_type": { "type": ["string", "null"], "maxLength": 120 }, - "description": { "type": ["string", "null"], "maxLength": 200 } + "modification_position": { }, + "modification_type": { }, + "description": { } } } }, - "fluorophore": { "type": ["string", "null"], "maxLength": 60 }, - "quencher": { "type": ["string", "null"], "maxLength": 60 } + "fluorophore": { }, + "quencher": { } } }, - "description": { "type": 
["string", "null"], "maxLength": 200 } + "description": { } } } } @@ -241,9 +200,9 @@ "additionalProperties": false, "required": ["missing", "uncertain", "notes"], "properties": { - "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, - "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, - "notes": { "type": ["string", "null"] } + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { } } } } From a58254c36b5d17b502d1e82e1122f17f5993dedc Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Wed, 1 Oct 2025 23:13:08 +0400 Subject: [PATCH 010/102] Will retry with the smaller outputs --- extraction/config/pipeline.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 39ca469..c636627 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -8,8 +8,8 @@ "phi4" ], "ollama_parameters": { - "num_ctx": 65536, - "num_predict": 32768, + "num_ctx": 8192, + "num_predict": 4096, "temperature": 0.15, "seed": 42 }, From 319b296b533f1131cf5d9738a5929e6cb9dd97b2 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Thu, 2 Oct 2025 00:12:15 +0400 Subject: [PATCH 011/102] Looks better for schema C --- extraction/config/pipeline.json | 35 ------------------- .../passes/C2_target_primers/schema.json | 2 +- extraction/passes/C_sequences/schema.json | 12 +++---- 3 files changed, 7 insertions(+), 42 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index c636627..74ca47c 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -20,31 +20,6 @@ "db_path": "outlines_output/massive.sqlite", "article_glob": "*.txt", "passes": [ - { - "name": "A_core", - "schema": "passes/A_core/schema.json", - "prompt": "passes/A_core/prompt.txt" - }, - { - "name": "B_index", 
- "schema": "passes/B_index/schema.json", - "prompt": "passes/B_index/prompt.txt" - }, - { - "name": "C1_probe_core", - "schema": "passes/C1_probe_core/schema.json", - "prompt": "passes/C1_probe_core/prompt.txt" - }, - { - "name": "C2_target_primers", - "schema": "passes/C2_target_primers/schema.json", - "prompt": "passes/C2_target_primers/prompt.txt" - }, - { - "name": "C3_related", - "schema": "passes/C3_related/schema.json", - "prompt": "passes/C3_related/prompt.txt" - }, { "name": "C_sequences", "schema": "passes/C_sequences/schema.json", @@ -54,16 +29,6 @@ "name": "D_parameters", "schema": "passes/D_parameters/schema.json", "prompt": "passes/D_parameters/prompt.txt" - }, - { - "name": "E_outcomes", - "schema": "passes/E_outcomes/schema.json", - "prompt": "passes/E_outcomes/prompt.txt" - }, - { - "name": "F_pairings", - "schema": "passes/F_pairings/schema.json", - "prompt": "passes/F_pairings/prompt.txt" } ] } \ No newline at end of file diff --git a/extraction/passes/C2_target_primers/schema.json b/extraction/passes/C2_target_primers/schema.json index 4301eb2..d4b82f8 100644 --- a/extraction/passes/C2_target_primers/schema.json +++ b/extraction/passes/C2_target_primers/schema.json @@ -83,7 +83,7 @@ "additionalProperties": false, "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { - "raw": { }, + "raw": { }, "sequence": { }, "length_bases": { }, "prime_prefix": { }, diff --git a/extraction/passes/C_sequences/schema.json b/extraction/passes/C_sequences/schema.json index 199d500..60014e9 100644 --- a/extraction/passes/C_sequences/schema.json +++ b/extraction/passes/C_sequences/schema.json @@ -19,12 +19,12 @@ "additionalProperties": false, "required": ["name", "amplicon_id", "fluorophore", "quencher", "sense_antisense", "notes", "oligo"], "properties": { - "name": { }, - "amplicon_id": { }, - "fluorophore": { }, - "quencher": { }, - 
"sense_antisense": { }, - "notes": { }, + "name": { "type": "string" }, + "amplicon_id": { "type" :["string", "null"], "maxLength": 100 }, + "fluorophore": { "type" :["string", "null"], "maxLength": 100 }, + "quencher": { "type" :["string", "null"], "maxLength": 100 }, + "sense_antisense": {"type": ["string", "null"], "enum": ["sense", "antisense"]}, + "notes": { "type" :["string", "null"], "maxLength": 100 }, "oligo": { "type": "object", "additionalProperties": false, From 2ed1f6d9813130441d431897e0efce53d38f8a97 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Thu, 2 Oct 2025 00:12:46 +0400 Subject: [PATCH 012/102] Attempt to test schema C with longer context --- extraction/config/pipeline.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 74ca47c..03179ec 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -8,8 +8,8 @@ "phi4" ], "ollama_parameters": { - "num_ctx": 8192, - "num_predict": 4096, + "num_ctx": 65536, + "num_predict": 32768, "temperature": 0.15, "seed": 42 }, From 6b8da65df335f304164b1fae6e62d1f2acf8f63b Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Thu, 2 Oct 2025 00:23:36 +0400 Subject: [PATCH 013/102] Tried to update schema C more strictly specifying types --- extraction/passes/C_sequences/schema.json | 110 ++++++++++++---------- 1 file changed, 58 insertions(+), 52 deletions(-) diff --git a/extraction/passes/C_sequences/schema.json b/extraction/passes/C_sequences/schema.json index 60014e9..3e5350b 100644 --- a/extraction/passes/C_sequences/schema.json +++ b/extraction/passes/C_sequences/schema.json @@ -19,10 +19,10 @@ "additionalProperties": false, "required": ["name", "amplicon_id", "fluorophore", "quencher", "sense_antisense", "notes", "oligo"], "properties": { - "name": { "type": "string" }, + "name": { "type": "string", "maxLength": 500 }, "amplicon_id": { "type" :["string", "null"], "maxLength": 
100 }, - "fluorophore": { "type" :["string", "null"], "maxLength": 100 }, - "quencher": { "type" :["string", "null"], "maxLength": 100 }, + "fluorophore": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "quencher": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, "sense_antisense": {"type": ["string", "null"], "enum": ["sense", "antisense"]}, "notes": { "type" :["string", "null"], "maxLength": 100 }, "oligo": { @@ -30,13 +30,13 @@ "additionalProperties": false, "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications"], "properties": { - "raw": { }, - "sequence": { }, - "length_bases": { }, - "prime_prefix": { }, - "five_prime_label": { }, - "three_prime_label": { }, - "sense_antisense": { }, + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": "number" }, + "prime_prefix": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "five_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "three_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "sense_antisense": {"type": ["string", "null"], "enum": ["sense", "antisense"]}, "modifications": { "type": "array", "items": { @@ -44,9 +44,9 @@ "additionalProperties": false, "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { }, - "modification_type": { }, - "description": { } + "modification_position": { "type": "number" }, + "modification_type": { "type" :"string", "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "description": { "type": "string", "maxLength": 100} } } } @@ -60,13 +60,13 @@ "additionalProperties": false, "required": ["raw", "sequence", "length_bases", 
"prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { - "raw": { }, - "sequence": { }, - "length_bases": { }, - "prime_prefix": { }, - "five_prime_label": { }, - "three_prime_label": { }, - "sense_antisense": { }, + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": "number" }, + "prime_prefix": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "five_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "three_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "sense_antisense": {"type": ["string", "null"], "enum": ["sense", "antisense"]}, "modifications": { "type": "array", "items": { @@ -74,14 +74,16 @@ "additionalProperties": false, "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { }, - "modification_type": { }, - "description": { } + "modification_position": { "type": "number" }, + "modification_type": { "type" :"string", "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "description": { "type": "string", "maxLength": 100} } - } + }, + "minItems": 0, + "maxItems": 100 }, - "fluorophore": { }, - "quencher": { } + "fluorophore": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "quencher": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" } } }, @@ -95,13 +97,13 @@ "additionalProperties": false, "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { - "raw": { }, - "sequence": { }, - "length_bases": { }, - "prime_prefix": { }, - "five_prime_label": { }, - "three_prime_label": { }, 
- "sense_antisense": { }, + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": "number" }, + "prime_prefix": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "five_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "three_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "sense_antisense": {"type": ["string", "null"], "enum": ["sense", "antisense"]}, "modifications": { "type": "array", "items": { @@ -109,14 +111,16 @@ "additionalProperties": false, "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { }, - "modification_type": { }, - "description": { } + "modification_position": { "type": "number" }, + "modification_type": { "type" :"string", "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "description": { "type": "string", "maxLength": 100} } - } + }, + "minItems": 0, + "maxItems": 100 }, - "fluorophore": { }, - "quencher": { } + "fluorophore": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "quencher": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" } } }, "reverse": { @@ -124,13 +128,13 @@ "additionalProperties": false, "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { - "raw": { }, - "sequence": { }, - "length_bases": { }, - "prime_prefix": { }, - "five_prime_label": { }, - "three_prime_label": { }, - "sense_antisense": { }, + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": "number" }, + "prime_prefix": { "type" 
:["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "five_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "three_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "sense_antisense": {"type": ["string", "null"], "enum": ["sense", "antisense"]}, "modifications": { "type": "array", "items": { @@ -138,14 +142,16 @@ "additionalProperties": false, "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { }, - "modification_type": { }, - "description": { } + "modification_position": { "type": "number" }, + "modification_type": { "type" :"string", "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "description": { "type": "string", "maxLength": 100} } - } + }, + "minItems": 0, + "maxItems": 100 }, - "fluorophore": { }, - "quencher": { } + "fluorophore": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "quencher": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" } } } } From b5d2dbe38cf2d08aeb0c1d652b3fde9e415bd0fa Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Thu, 2 Oct 2025 01:04:38 +0400 Subject: [PATCH 014/102] C5 schema created --- extraction/config/pipeline.json | 6 +- extraction/passes/C4_probe_target/prompt.txt | 17 ++ extraction/passes/C4_probe_target/schema.json | 174 ++++++++++++++++++ .../passes/C5_probes_opt_target/prompt.txt | 44 +++++ .../passes/C5_probes_opt_target/schema.json | 110 +++++++++++ extraction/passes/C_sequences/schema.json | 144 ++++++++------- 6 files changed, 421 insertions(+), 74 deletions(-) create mode 100644 extraction/passes/C4_probe_target/prompt.txt create mode 100644 extraction/passes/C4_probe_target/schema.json create mode 100644 extraction/passes/C5_probes_opt_target/prompt.txt create mode 100644 extraction/passes/C5_probes_opt_target/schema.json diff --git 
a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 03179ec..cae3438 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -21,9 +21,9 @@ "article_glob": "*.txt", "passes": [ { - "name": "C_sequences", - "schema": "passes/C_sequences/schema.json", - "prompt": "passes/C_sequences/prompt.txt" + "name": "C5_probes_opt_target", + "schema": "passes/C5_probes_opt_target/schema.json", + "prompt": "passes/C5_probes_opt_target/prompt.txt" }, { "name": "D_parameters", diff --git a/extraction/passes/C4_probe_target/prompt.txt b/extraction/passes/C4_probe_target/prompt.txt new file mode 100644 index 0000000..c506546 --- /dev/null +++ b/extraction/passes/C4_probe_target/prompt.txt @@ -0,0 +1,17 @@ +You are an extraction model. Return ONE JSON object conforming to the JSON Schema (the caller enforces it). + +SCOPE (this pass = RELATED SEQUENCES) +For each experiment id_exp, extract zero or more related sequences: +- related_sequences[]: each item has related_sequence (oligo decomposition like probes) and optional description. + +RULES +- oligo.raw copied EXACTLY; must include ≥1 nucleotide letter; no ellipses. +- sequence = IUPAC uppercase only (no spaces/punct.), else null. +- prime_prefix = 5 or 3 if shown, else null. +- five_prime_label / three_prime_label if printed, else null. +- fluorophore / quencher if printed, else null. +- modifications[] empty if absent. +- If not provided in the article, use an empty array. Do NOT invent sequences. 
+ +OUTPUT +A single JSON object with: { items: [ { id_exp, related_sequences[] }, … ], extraction_report } diff --git a/extraction/passes/C4_probe_target/schema.json b/extraction/passes/C4_probe_target/schema.json new file mode 100644 index 0000000..a2256c4 --- /dev/null +++ b/extraction/passes/C4_probe_target/schema.json @@ -0,0 +1,174 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "SequencesPerExperiment (generation schema)", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "probe", "target_sequence", "primer_sequences", "related_sequences"], + "properties": { + "id_exp": { "type": "string" }, + + "probe": { + "type": "object", + "additionalProperties": false, + "required": ["name", "amplicon_id", "fluorophore", "quencher", "sense_antisense", "notes", "oligo"], + "properties": { + "name": { "type": "string", "maxLength": 500 }, + "amplicon_id": { "type" :["string", "null"], "maxLength": 100 }, + "fluorophore": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "quencher": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + "notes": { "type" :["string", "null"], "maxLength": 100 }, + "oligo": { + "type": "object", + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": "number" }, + "prime_prefix": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "five_prime_label": { "type" 
:["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "three_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { "type": "number" }, + "modification_type": { "type" :"string", "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "description": { "type": "string", "maxLength": 100} + } + } + } + } + } + } + }, + + "target_sequence": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": "number" }, + "prime_prefix": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "five_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "three_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { "type": "number" }, + "modification_type": { "type" :"string", "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "description": { "type": "string", "maxLength": 100} + } + }, + "minItems": 0, + "maxItems": 100 + }, + "fluorophore": { "type" :["string", "null"], 
"maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "quencher": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" } + } + }, + + "primer_sequences": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["forward", "reverse"], + "properties": { + "forward": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": "number" }, + "prime_prefix": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "five_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "three_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { "type": "number" }, + "modification_type": { "type" :"string", "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "description": { "type": "string", "maxLength": 100} + } + }, + "minItems": 0, + "maxItems": 100 + }, + "fluorophore": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "quencher": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" } + } + }, + "reverse": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", 
"fluorophore", "quencher"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": "number" }, + "prime_prefix": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "five_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "three_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { "type": "number" }, + "modification_type": { "type" :"string", "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "description": { "type": "string", "maxLength": 100} + } + }, + "minItems": 0, + "maxItems": 100 + }, + "fluorophore": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "quencher": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" } + } + } + } + } + } + } + }, + + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { } + } + } + } +} diff --git a/extraction/passes/C5_probes_opt_target/prompt.txt b/extraction/passes/C5_probes_opt_target/prompt.txt new file mode 100644 index 0000000..3161765 --- /dev/null +++ b/extraction/passes/C5_probes_opt_target/prompt.txt @@ -0,0 +1,44 @@ +You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. + +* Never invent values; use `null` when unknown. 
+* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article’s wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. + +For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following tasks: +* For each `id_exp`, extract probe (name and the full oligo string exactly as printed in the article text), and include optional `target_sequence`, `primer_sequences`, and `related_sequences` when present, otherwise set them to `null`. +* The `oligo_lite.raw` must contain nucleotides and no ellipses. +* Keep labels like FAM/ROX/BHQ2 in the text; if article does not mention them explicitly, leave derived fields `null`. + +You are an extraction model. Return ONE JSON object that conforms to the JSON Schema (the caller enforces it). + +STRICT RULES +- Emit every key defined by the schema. If a value is not explicitly present in the article, set it to null. Do NOT invent. +- Copy oligo.raw exactly as printed in the article (no ellipses, no placeholders). oligo.raw MUST include at least one nucleotide letter. +- sequence must be IUPAC uppercase only: A C G U/T R Y S W K M B D H V N (no spaces, no punctuation). +- prime_prefix is 5 or 3 when the prefix like “5′-” or “3′-” is present, else null. +- five_prime_label / three_prime_label: labels at 5′/3′ ends (e.g., FAM, ROX, BHQ1, BHQ2, RTQ1), else null. +- fluorophore / quencher: extract if present, else null. +- sense_antisense: map explicit mentions (e.g., “(27)s” -> "sense", “(27)as” -> "antisense"), else null. +- modifications[]: if any modified bases or special chemistry is specified, enumerate entries; else empty array. 
+- related_sequences: array (possibly empty). primers: object with forward/reverse (or null if not provided). + +TASK +From the article text, produce per-experiment items with: +- id_exp +- probe { name, amplicon_id?, fluorophore?, quencher?, sense_antisense?, notes?, oligo{raw, sequence?, length_bases?, prime_prefix?, five_prime_label?, three_prime_label?, sense_antisense?, modifications[]}} +- target_sequence (same oligo decomposition) or null +- primer_sequences {forward oligo, reverse oligo} or null +- related_sequences[] {related_sequence oligo, description?} + +EXTRACTION REPORT +- Put any truly unavailable or ambiguous fields in extraction_report.missing / extraction_report.uncertain. +- Do NOT hallucinate. Prefer null + report over guesses. + +OUTPUT +Return exactly one JSON object that conforms to the schema. No prose. diff --git a/extraction/passes/C5_probes_opt_target/schema.json b/extraction/passes/C5_probes_opt_target/schema.json new file mode 100644 index 0000000..dc42fd3 --- /dev/null +++ b/extraction/passes/C5_probes_opt_target/schema.json @@ -0,0 +1,110 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "SequencesPerExperiment (generation schema)", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "probe", "target_sequence", "primer_sequences"], + "properties": { + "id_exp": { "type": "string", "maxLength": 100 }, + "probe": { + "type": "object", + "additionalProperties": false, + "required": ["name", "oligo"], + "properties": { + "name": { "type": "string", "maxLength": 500 }, + "oligo": { + "type": "object", + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, 
"pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "sense_antisense": { "enum": ["sense", "antisense", null] } + } + } + } + }, + + "target_sequence": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "sense_antisense": { "enum": ["sense", "antisense", null] } + } + }, + + "primer_sequences": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["forward", "reverse"], + "properties": { + "forward": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "sense_antisense": { "enum": ["sense", "antisense", null] } + } + }, + "reverse": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "sense_antisense": { "enum": ["sense", "antisense", null] } + } + } + } + }, + + "related_sequences": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["related_sequence", "description"], + "properties": { + "related_sequence": { + "type": "object", + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "sense_antisense": { "enum": ["sense", "antisense", null] } + } + }, + "description": { } + } + } + } + 
} + } + }, + + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { } + } + } + } +} diff --git a/extraction/passes/C_sequences/schema.json b/extraction/passes/C_sequences/schema.json index 3e5350b..bd34650 100644 --- a/extraction/passes/C_sequences/schema.json +++ b/extraction/passes/C_sequences/schema.json @@ -7,6 +7,7 @@ "properties": { "items": { "type": "array", + "minItems": 1, "items": { "type": "object", "additionalProperties": false, @@ -20,11 +21,12 @@ "required": ["name", "amplicon_id", "fluorophore", "quencher", "sense_antisense", "notes", "oligo"], "properties": { "name": { "type": "string", "maxLength": 500 }, - "amplicon_id": { "type" :["string", "null"], "maxLength": 100 }, - "fluorophore": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "quencher": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "sense_antisense": {"type": ["string", "null"], "enum": ["sense", "antisense"]}, - "notes": { "type" :["string", "null"], "maxLength": 100 }, + "amplicon_id": { "type": ["string", "null"], "maxLength": 100 }, + "fluorophore": { "type": ["string", "null"], "maxLength": 100 }, + "quencher": { "type": ["string", "null"], "maxLength": 100 }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + "notes": { "type": ["string", "null"], "maxLength": 100 }, + "oligo": { "type": "object", "additionalProperties": false, @@ -32,21 +34,22 @@ "properties": { "raw": { "type": "string", "maxLength": 500 }, "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, - "length_bases": { "type": "number" }, - "prime_prefix": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "five_prime_label": 
{ "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "three_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "sense_antisense": {"type": ["string", "null"], "enum": ["sense", "antisense"]}, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "enum": [3, 5, null] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, "modifications": { "type": "array", + "minItems": 0, "items": { "type": "object", "additionalProperties": false, "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { "type": "number" }, - "modification_type": { "type" :"string", "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "description": { "type": "string", "maxLength": 100} + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 100 }, + "description": { "type": ["string", "null"], "maxLength": 100 } } } } @@ -61,29 +64,28 @@ "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { "raw": { "type": "string", "maxLength": 500 }, - "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, - "length_bases": { "type": "number" }, - "prime_prefix": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "five_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "three_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "sense_antisense": {"type": ["string", "null"], "enum": ["sense", "antisense"]}, + 
"sequence": { "type": ["string", "null"], "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "enum": [3, 5, null] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, "modifications": { "type": "array", + "minItems": 0, "items": { "type": "object", "additionalProperties": false, "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { "type": "number" }, - "modification_type": { "type" :"string", "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "description": { "type": "string", "maxLength": 100} + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 100 }, + "description": { "type": ["string", "null"], "maxLength": 100 } } - }, - "minItems": 0, - "maxItems": 100 + } }, - "fluorophore": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "quencher": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" } + "fluorophore": { "type": ["string", "null"], "maxLength": 100 }, + "quencher": { "type": ["string", "null"], "maxLength": 100 } } }, @@ -98,29 +100,28 @@ "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { "raw": { "type": "string", "maxLength": 500 }, - "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, - "length_bases": { "type": "number" }, - "prime_prefix": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "five_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": 
"^[\\w-_+()'`]+$" }, - "three_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "sense_antisense": {"type": ["string", "null"], "enum": ["sense", "antisense"]}, + "sequence": { "type": ["string", "null"], "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "enum": [3, 5, null] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, "modifications": { "type": "array", + "minItems": 0, "items": { "type": "object", "additionalProperties": false, "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { "type": "number" }, - "modification_type": { "type" :"string", "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "description": { "type": "string", "maxLength": 100} + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 100 }, + "description": { "type": ["string", "null"], "maxLength": 100 } } - }, - "minItems": 0, - "maxItems": 100 + } }, - "fluorophore": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "quencher": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" } + "fluorophore": { "type": ["string", "null"], "maxLength": 100 }, + "quencher": { "type": ["string", "null"], "maxLength": 100 } } }, "reverse": { @@ -129,29 +130,28 @@ "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { "raw": { "type": "string", "maxLength": 500 }, - "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" 
}, - "length_bases": { "type": "number" }, - "prime_prefix": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "five_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "three_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "sense_antisense": {"type": ["string", "null"], "enum": ["sense", "antisense"]}, + "sequence": { "type": ["string", "null"], "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "enum": [3, 5, null] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, "modifications": { "type": "array", + "minItems": 0, "items": { "type": "object", "additionalProperties": false, "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { "type": "number" }, - "modification_type": { "type" :"string", "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "description": { "type": "string", "maxLength": 100} + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 100 }, + "description": { "type": ["string", "null"], "maxLength": 100 } } - }, - "minItems": 0, - "maxItems": 100 + } }, - "fluorophore": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, - "quencher": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" } + "fluorophore": { "type": ["string", "null"], "maxLength": 100 }, + "quencher": { "type": ["string", "null"], "maxLength": 100 } } } } @@ -159,6 +159,7 @@ "related_sequences": { "type": "array", + "minItems": 0, "items": { "type": "object", "additionalProperties": false, @@ -169,31 
+170,32 @@ "additionalProperties": false, "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { - "raw": { }, - "sequence": { }, - "length_bases": { }, - "prime_prefix": { }, - "five_prime_label": { }, - "three_prime_label": { }, - "sense_antisense": { }, + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": ["string", "null"], "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "enum": [3, 5, null] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, "modifications": { "type": "array", + "minItems": 0, "items": { "type": "object", "additionalProperties": false, "required": ["modification_position", "modification_type", "description"], "properties": { - "modification_position": { }, - "modification_type": { }, - "description": { } + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 100 }, + "description": { "type": ["string", "null"], "maxLength": 100 } } } }, - "fluorophore": { }, - "quencher": { } + "fluorophore": { "type": ["string", "null"], "maxLength": 100 }, + "quencher": { "type": ["string", "null"], "maxLength": 100 } } }, - "description": { } + "description": { "type": ["string", "null"], "maxLength": 200 } } } } @@ -206,9 +208,9 @@ "additionalProperties": false, "required": ["missing", "uncertain", "notes"], "properties": { - "missing": { "type": "array", "items": { "type": "string" } }, - "uncertain": { "type": "array", "items": { "type": "string" } }, - "notes": { } + "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "uncertain": { 
"type": "array", "items": { "type": "string" }, "minItems": 0 }, + "notes": { "type": ["string", "null"] } } } } From ffd5bc64a1cd1f74653ea52cbe394a0a6e0e2d93 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Thu, 2 Oct 2025 01:07:20 +0400 Subject: [PATCH 015/102] C5 schema works, but not ideal --- extraction/config/pipeline.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index cae3438..c8ec5ef 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -14,7 +14,7 @@ "seed": 42 }, "timeout_s": 300, - "input_dir": "../outputs/text", + "input_dir": "input/txt", "out_dir": "outlines_output", "full_schema_path": "schema/json/article.json", "db_path": "outlines_output/massive.sqlite", From 8c40a1ca3815b6e443e6e4f36067ce90808e422e Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Thu, 2 Oct 2025 01:58:35 +0400 Subject: [PATCH 016/102] Added descriptions to all fields in C5 --- .../passes/C5_probes_opt_target/schema.json | 157 +++++++++--------- 1 file changed, 77 insertions(+), 80 deletions(-) diff --git a/extraction/passes/C5_probes_opt_target/schema.json b/extraction/passes/C5_probes_opt_target/schema.json index dc42fd3..e54f80b 100644 --- a/extraction/passes/C5_probes_opt_target/schema.json +++ b/extraction/passes/C5_probes_opt_target/schema.json @@ -1,109 +1,106 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", "title": "SequencesPerExperiment (generation schema)", - "type": "object", + "type": "array", "additionalProperties": false, "required": ["items", "extraction_report"], - "properties": { - "items": { - "type": "array", - "items": { + "description": "This JSON object represents a list of all probes present in article together with their target sequences and primers (if present, must be filled out). 
Each experiment represents a single object in this array.", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "probe", "target_sequence", "primer_sequences"], + "description": "This object represents a single experiment record defined by the probe. There must be an exclusive record for each probe found in the full article text.", + "properties": { + "id_exp": { "type": "string", "maxLength": 100, "description": "ID of the experiment, unique string taken either from the article or synthesized to be unique and consecutive in the array." }, + "probe": { "type": "object", "additionalProperties": false, "required": ["name", "raw", "sequence", "sense_antisense"], "description": "This object represents a single probe found in the article. Such an object must be created for each probe present in the article.", "properties": { - "name": { "type": "string", "maxLength": 500 }, - "oligo": { - "type": "object", - "additionalProperties": false, - "required": ["raw", "sequence", "sense_antisense"], - "properties": { - "raw": { "type": "string", "maxLength": 500 }, - "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, - "sense_antisense": { "enum": ["sense", "antisense", null] } - } - } - } - }, + "name": { "type": "string", "maxLength": 500, "description": "Name of the probe as provided in the article or description of the probe created by you." }, + "raw": { "type": "string", "maxLength": 500, "description": "Direct text quote from the article describing the current single probe." 
}, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "IUPAC sequence of the probe in the same order as given in the article." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the sequence of this probe provided in the article was sense or antisense. Set to null if and only if strand of this probe can't be inferred from the article text." } + } + }, - "target_sequence": { + "target_sequence": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "description": "This object describes the target sequence for which the probe from the current experiment record was constructed. Omit by providing null if and only if article does not provide the target sequence explicitly.", + "properties": { + "raw": { "type": "string", "maxLength": 500, "description": "Target sequence for which the probe from the current experiment was constructed. Provide a name or other mention exactly how it is specified in article." }, + "sequence": { "type": ["string", "null"], "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Target sequence in IUPAC format. Must be specified if present in article for the current probe of the current experiment. Omit by providing null if and only if article does not specify target sequence and only provides its name or otherwise non-IUPAC definition." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the provided target sequence of this probe was sense or antisense. Set to null if and only if strand of this probe can't be inferred from the article text." } + } + }, + + "primer_sequences": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["forward", "reverse"], + "description": "This object holds information about the primers used for the current experiment record for the probe. 
Omit by providing null here if and only if primers are not specified for the current probe in the article text.", "properties": { "forward": { "type": ["object", "null"], "additionalProperties": false, "required": ["raw", "sequence", "sense_antisense"], + "description": "Forward primer in this experiment. Omit by providing null here if and only if the forward primer is not specified for the current probe in the article text.", "properties": { - "raw": { "type": "string", "maxLength": 500 }, - "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, - "sense_antisense": { "enum": ["sense", "antisense", null] } + "raw": { "type": "string", "maxLength": 500, "description": "Forward primer sequence used for the probe from the current experiment. Provide a name or other mention exactly as it is specified in the article." }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Forward primer sequence in IUPAC format for the current probe in the current experiment. Must be specified if present in the article for the current probe of the current experiment. Omit by providing null if and only if article does not specify forward primer sequence and only provides its name or otherwise non-IUPAC definition." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the provided forward primer sequence of this probe was sense or antisense. 
Omit by providing null here if and only if reverse is not specified for the current probe in the article text.", "properties": { - "forward": { - "type": ["object", "null"], - "additionalProperties": false, - "required": ["raw", "sequence", "sense_antisense"], - "properties": { - "raw": { "type": "string", "maxLength": 500 }, - "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, - "sense_antisense": { "enum": ["sense", "antisense", null] } - } - }, - "reverse": { - "type": ["object", "null"], - "additionalProperties": false, - "required": ["raw", "sequence", "sense_antisense"], - "properties": { - "raw": { "type": "string", "maxLength": 500 }, - "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, - "sense_antisense": { "enum": ["sense", "antisense", null] } - } - } + "raw": { "type": "string", "maxLength": 500, "description": "Reverse primer sequence used for the probe from the current experiment. Provide a name or other mention exactly how it is specified in article." }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Reverse primer sequence in IUPAC format for the current probe in the current experiment. Must be specified if present in article for the current probe of the current experiment. Omit by providing null if and only if article does not specify reverse primer sequence and only provides its name or otherwise non-IUPAC definition." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the provided reverse primer sequence of this probe was sense or antisense. Set to null if and only if strand of this reverse primer can't be inferred from the article text." 
} } - }, + } + } + }, - "related_sequences": { - "type": "array", - "items": { + "related_sequences": { + "type": "array", + "description": "Array containing any other related sequences relating to the probe, target or primers in the current experiment if present in article. Omit by providing an empty array if and only if article does not specify any related sequences for the current experiment.", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["related_sequence", "description"], + "properties": { + "related_sequence": { "type": "object", "additionalProperties": false, - "required": ["related_sequence", "description"], + "required": ["raw", "sequence", "sense_antisense"], + "description": "Related sequence record for the current experiment.", "properties": { - "related_sequence": { - "type": "object", - "additionalProperties": false, - "required": ["raw", "sequence", "sense_antisense"], - "properties": { - "raw": { "type": "string", "maxLength": 500 }, - "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, - "sense_antisense": { "enum": ["sense", "antisense", null] } - } - }, - "description": { } + "raw": { "type": "string", "maxLength": 500, "description": "Related sequence used for the probe from the current experiment. Provide a name or other mention exactly how it is specified in article." }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Related sequence in IUPAC format for the current probe in the current experiment. Must be specified if present in article for the current probe of the current experiment. Omit by providing null if and only if article does not specify related sequence and only provides its name or otherwise non-IUPAC definition." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the provided related sequence of this probe was sense or antisense. 
Set to null if and only if strand of this related can't be inferred from the article text." } } - } + }, + "description": { "type": "string", "maxLength": 500, "description": "Name and/or description of this related sequence, explaining its relation to the current experiment."} } } - } - }, - - "extraction_report": { - "type": "object", - "additionalProperties": false, - "required": ["missing", "uncertain", "notes"], - "properties": { - "missing": { "type": "array", "items": { "type": "string" } }, - "uncertain": { "type": "array", "items": { "type": "string" } }, - "notes": { } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "description": "Information about technical specifics of the current experiment extraction.", + "properties": { + "missing": { "type": "array", "items": { "type": "string", "maxLength": 500 }, "description": "Array describing missing elements which where set to null or omitted. Each record must be a JSON path to corresponding field." }, + "uncertain": { "type": "array", "items": { "type": "string", "maxLength": 500 }, "description": "Array describing uncertain elements which article mentions vaguely or controversely. Each record must be a JSON path to corresponding field." 
}, + "notes": { "type": "string", "description": "Any other notes related to the extraction of this experiment", "maxLength": 500 } + } } } } From 5bd0b1bb98ca0ee89bfc70836b7fc00c91635a65 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Thu, 2 Oct 2025 03:00:53 +0400 Subject: [PATCH 017/102] Temporarily switch off tools --- .../passes/C5_probes_opt_target/prompt.txt | 155 +++++++++++++++--- 1 file changed, 128 insertions(+), 27 deletions(-) diff --git a/extraction/passes/C5_probes_opt_target/prompt.txt b/extraction/passes/C5_probes_opt_target/prompt.txt index 3161765..18e9ddc 100644 --- a/extraction/passes/C5_probes_opt_target/prompt.txt +++ b/extraction/passes/C5_probes_opt_target/prompt.txt @@ -1,44 +1,145 @@ -You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. +You are an information-extraction model. Return ONE JSON object that conforms to the JSON Schema (the caller enforces it). -* Never invent values; use `null` when unknown. -* Keep text exactly as in the article (no ellipses, no expansions). -* Output all data fully, never skip or insert ellipses. -* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. -* Use the article’s wording for names. -* Do not copy sequences from examples! -* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +General rules: +- Never invent values; use `null` when unknown. +- Keep text exactly as in the article (no ellipses, no expansions). +- Output all data fully, never skip or insert ellipses. +- If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +- Use the article’s wording for names. +- Do not copy sequences from examples! +- No one will manually check and finish your job. 
It will be automated and must be correct and suitable for automated analysis. -For the perfect result compliant to all constraints and limitations I will tip $2000! - -Perform the following tasks: -* For each `id_exp`, extract probe (name and the full oligo string exactly as printed in the article text), and include optional `target_sequence`, `primer_sequences`, and `related_sequences` when present, otherwise set them to `null`. -* The `oligo_lite.raw` must contain nucleotides and no ellipses. -* Keep labels like FAM/ROX/BHQ2 in the text; if article does not mention them explicitly, leave derived fields `null`. - -You are an extraction model. Return ONE JSON object that conforms to the JSON Schema (the caller enforces it). - -STRICT RULES +STRICT RULES: - Emit every key defined by the schema. If a value is not explicitly present in the article, set it to null. Do NOT invent. -- Copy oligo.raw exactly as printed in the article (no ellipses, no placeholders). oligo.raw MUST include at least one nucleotide letter. +- Copy oligo raw exactly as printed in the article (no ellipses, no placeholders). oligo raw MUST include at least one nucleotide letter. - sequence must be IUPAC uppercase only: A C G U/T R Y S W K M B D H V N (no spaces, no punctuation). -- prime_prefix is 5 or 3 when the prefix like “5′-” or “3′-” is present, else null. -- five_prime_label / three_prime_label: labels at 5′/3′ ends (e.g., FAM, ROX, BHQ1, BHQ2, RTQ1), else null. -- fluorophore / quencher: extract if present, else null. - sense_antisense: map explicit mentions (e.g., “(27)s” -> "sense", “(27)as” -> "antisense"), else null. - modifications[]: if any modified bases or special chemistry is specified, enumerate entries; else empty array. - related_sequences: array (possibly empty). primers: object with forward/reverse (or null if not provided). 
-TASK -From the article text, produce per-experiment items with: +TASK: +From the article text, produce per-experiment array of records with: - id_exp -- probe { name, amplicon_id?, fluorophore?, quencher?, sense_antisense?, notes?, oligo{raw, sequence?, length_bases?, prime_prefix?, five_prime_label?, three_prime_label?, sense_antisense?, modifications[]}} +- probe { name, raw, sequence, sense_antisense?}} - target_sequence (same oligo decomposition) or null - primer_sequences {forward oligo, reverse oligo} or null - related_sequences[] {related_sequence oligo, description?} +Extract all probes, all sequences! All that are found in the article's text. No probe, no target sequence and no primer are to be skipped. Use all in the records. -EXTRACTION REPORT +EXTRACTION REPORT: - Put any truly unavailable or ambiguous fields in extraction_report.missing / extraction_report.uncertain. - Do NOT hallucinate. Prefer null + report over guesses. -OUTPUT +JSON SCHEMA you MUST FOLLOW: +``` +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "SequencesPerExperiment (generation schema)", + "type": "array", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "description": "This JSON object represents a list of all probes present in article together with their target sequences and primers (if present, must be filled out). Each experiment represents a single object in this array.", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "probe", "target_sequence", "primer_sequences"], + "description": "This object represents a single experiment record defined by the probe. There must be exclusive record for each probe found in the full article text.", + "properties": { + "id_exp": { "type": "string", "maxLength": 100, "description": "ID of the experiment, unique string taken either from the article or synthesized to be unique and consequtive in the array." 
}, + "probe": { + "type": "object", + "additionalProperties": false, + "required": ["name", "raw", "sequence", "sense_antisense"], + "description": "This object represents a single probe found in the article. Such object must be created for each probe present in the article.", + "properties": { + "name": { "type": "string", "maxLength": 500, "description": "Name of the probe as provided in the article or description of the probe created by you." }, + "raw": { "type": "string", "maxLength": 500, "description": "Direct text quote from the article describing current single probe." }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "IUPAC sequence of the probe in the same order as given in the article." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the sequence of this probe provided in the article was sense or antisense. Set to null if and only if strand of this probe can't be inferred from the article text." } + } + }, + + "target_sequence": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "description": "This object describes the target sequence for which the probe from the current experiment record was constructed. Omit by providing null if and only if article does not provide the target sequence explicitly.", + "properties": { + "raw": { "type": "string", "maxLength": 500, "description": "Target sequence for which the probe from the current experiment was constructed. Provide a name or other mention exactly how it is specified in article." }, + "sequence": { "type": ["string", "null"], "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Target sequence in IUPAC format. Must be specified if present in article for the current probe of the current experiment. 
Omit by providing null if and only if article does not specify target sequence and only provides its name or otherwise non-IUPAC definition." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the provided target sequence of this probe was sense or antisense. Set to null if and only if strand of this probe can't be inferred from the article text." } + } + }, + + "primer_sequences": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["forward", "reverse"], + "description": "This object holds information about the primers used for the current experiment record for the probe. Omit by providing null here if and only if primers are not specified for the current probe in the article text.", + "properties": { + "forward": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "description": "Forward primer in this experiment. Omit by providing null here if and only if forward is not specified for the current probe in the article text.", + "properties": { + "raw": { "type": "string", "maxLength": 500, "description": "Forward primer sequence used for the probe from the current experiment. Provide a name or other mention exactly how it is specified in article." }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Forward primer sequence in IUPAC format for the current probe in the current experiment. Must be specified if present in article for the current probe of the current experiment. Omit by providing null if and only if article does not specify forward primer sequence and only provides its name or otherwise non-IUPAC definition." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the provided forward primer sequence of this probe was sense or antisense. 
Set to null if and only if strand of this forward primer can't be inferred from the article text." } + } + }, + "reverse": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "description": "Reverse primer in this experiment. Omit by providing null here if and only if reverse is not specified for the current probe in the article text.", + "properties": { + "raw": { "type": "string", "maxLength": 500, "description": "Reverse primer sequence used for the probe from the current experiment. Provide a name or other mention exactly how it is specified in article." }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Reverse primer sequence in IUPAC format for the current probe in the current experiment. Must be specified if present in article for the current probe of the current experiment. Omit by providing null if and only if article does not specify reverse primer sequence and only provides its name or otherwise non-IUPAC definition." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the provided reverse primer sequence of this probe was sense or antisense. Set to null if and only if strand of this reverse primer can't be inferred from the article text." } + } + } + } + }, + + "related_sequences": { + "type": "array", + "description": "Array containing any other related sequences relating to the probe, target or primers in the current experiment if present in article. 
Omit by providing an empty array if and only if article does not specify any related sequences for the current experiment.", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["related_sequence", "description"], + "properties": { + "related_sequence": { + "type": "object", + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "description": "Related sequence record for the current experiment.", + "properties": { + "raw": { "type": "string", "maxLength": 500, "description": "Related sequence used for the probe from the current experiment. Provide a name or other mention exactly how it is specified in article." }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Related sequence in IUPAC format for the current probe in the current experiment. Must be specified if present in article for the current probe of the current experiment. Omit by providing null if and only if article does not specify related sequence and only provides its name or otherwise non-IUPAC definition." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the provided related sequence of this probe was sense or antisense. Set to null if and only if strand of this related can't be inferred from the article text." } + } + }, + "description": { "type": "string", "maxLength": 500, "description": "Name and/or description of this related sequence, explaining its relation to the current experiment."} + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "description": "Information about technical specifics of the current experiment extraction.", + "properties": { + "missing": { "type": "array", "items": { "type": "string", "maxLength": 500 }, "description": "Array describing missing elements which where set to null or omitted. 
Each record must be a JSON path to corresponding field." }, + "uncertain": { "type": "array", "items": { "type": "string", "maxLength": 500 }, "description": "Array describing uncertain elements which article mentions vaguely or contradictorily. Each record must be a JSON path to corresponding field." }, + "notes": { "type": "string", "description": "Any other notes related to the extraction of this experiment", "maxLength": 500 } + } + } + } + } +} +``` + +OUTPUT: Return exactly one JSON object that conforms to the schema. No prose. From 1b4444bd3425b2d9d5c08f1b7444b939ed17b349 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Thu, 2 Oct 2025 03:02:45 +0400 Subject: [PATCH 018/102] Try to run all schemas on all models --- extraction/config/pipeline.json | 40 +++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index c8ec5ef..dbf06ba 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -20,6 +20,36 @@ "db_path": "outlines_output/massive.sqlite", "article_glob": "*.txt", "passes": [ + { + "name": "A_core", + "schema": "passes/A_core/schema.json", + "prompt": "passes/A_core/prompt.txt" + }, + { + "name": "B_index", + "schema": "passes/B_index/schema.json", + "prompt": "passes/B_index/prompt.txt" + }, + { + "name": "C1_probe_core", + "schema": "passes/C1_probe_core/schema.json", + "prompt": "passes/C1_probe_core/prompt.txt" + }, + { + "name": "C2_target_primers", + "schema": "passes/C2_target_primers/schema.json", + "prompt": "passes/C2_target_primers/prompt.txt" + }, + { + "name": "C3_related", + "schema": "passes/C3_related/schema.json", + "prompt": "passes/C3_related/prompt.txt" + }, + { + "name": "C_sequences", + "schema": "passes/C_sequences/schema.json", + "prompt": "passes/C_sequences/prompt.txt" + }, { + "name": "C5_probes_opt_target", + "schema": "passes/C5_probes_opt_target/schema.json", + "prompt": "passes/C5_probes_opt_target/prompt.txt" @@ -29,6 +59,16 @@ "name": "D_parameters", 
"schema": "passes/D_parameters/schema.json", "prompt": "passes/D_parameters/prompt.txt" + }, + { + "name": "E_outcomes", + "schema": "passes/E_outcomes/schema.json", + "prompt": "passes/E_outcomes/prompt.txt" + }, + { + "name": "F_pairings", + "schema": "passes/F_pairings/schema.json", + "prompt": "passes/F_pairings/prompt.txt" } ] } \ No newline at end of file From 8dbe352b6c336d9866282ddc4db9993a7644eb61 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 3 Oct 2025 03:03:39 +0400 Subject: [PATCH 019/102] Adding Marker-PDF --- extraction/config/pipeline.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index dbf06ba..93c6339 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -14,11 +14,11 @@ "seed": 42 }, "timeout_s": 300, - "input_dir": "input/txt", + "input_dir": "/mnt/Models/articles2_marker/no_llm/markdown/", "out_dir": "outlines_output", "full_schema_path": "schema/json/article.json", "db_path": "outlines_output/massive.sqlite", - "article_glob": "*.txt", + "article_glob": "**/*.md", "passes": [ { "name": "A_core", From 56aafdb29dc27d20861a50bea9b1db3c42e3659d Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 3 Oct 2025 03:34:50 +0400 Subject: [PATCH 020/102] Updated marker-pdf script to stream progress --- extraction/config/pipeline.json | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 93c6339..e5508c4 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,11 +1,7 @@ { "model_names": [ - "llama3.1", - "myaniu/qwen2.5-1m:7b", - "gemma3:27b", - "llava:34b", "phi3", - "phi4" + "myaniu/qwen2.5-1m:7b" ], "ollama_parameters": { "num_ctx": 65536, From 62ed81fc73b281a8ad5d45c8f28a78d894bcfce5 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 3 Oct 2025 12:59:57 +0400 Subject: [PATCH 
021/102] Reordered models for bench --- extraction/config/pipeline.json | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index e5508c4..9ec33eb 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,7 +1,11 @@ { "model_names": [ "phi3", - "myaniu/qwen2.5-1m:7b" + "phi4", + "myaniu/qwen2.5-1m:7b", + "llama3.1", + "gemma3:27b", + "llava:34b" ], "ollama_parameters": { "num_ctx": 65536, From ad14cb24aa4608ce9fdfd2cdfd325e913569dfae Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 3 Oct 2025 13:14:46 +0400 Subject: [PATCH 022/102] Added all models and Ollama base URL to config, but requires a lot of VRAM --- extraction/config/pipeline.json | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 9ec33eb..8b94855 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,8 +1,9 @@ { "model_names": [ - "phi3", - "phi4", + "phi3:mini-128k", + "phi4:14b", "myaniu/qwen2.5-1m:7b", + "gemma3:4b-it-qat", "llama3.1", "gemma3:27b", "llava:34b" @@ -13,6 +14,7 @@ "temperature": 0.15, "seed": 42 }, + "ollama_base_url": "http://127.0.0.1:11434", "timeout_s": 300, "input_dir": "/mnt/Models/articles2_marker/no_llm/markdown/", "out_dir": "outlines_output", From 5585682e07809a0547d6268606988d525f9d4215 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 3 Oct 2025 13:15:25 +0400 Subject: [PATCH 023/102] Removed excessive models --- extraction/config/pipeline.json | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 8b94855..c1320dd 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,12 +1,9 @@ { "model_names": [ - "phi3:mini-128k", "phi4:14b", "myaniu/qwen2.5-1m:7b", "gemma3:4b-it-qat", - "llama3.1", - 
"gemma3:27b", - "llava:34b" + "llama3.1:latest" ], "ollama_parameters": { "num_ctx": 65536, From fa6803a37cbb201d3c9a5947dbf2dacbc6eac030 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 3 Oct 2025 13:42:10 +0400 Subject: [PATCH 024/102] Update prompts for Chinese models to ask results in English --- extraction/config/pipeline.json | 7 ++++--- extraction/passes/A_core/prompt.txt | 4 ++-- extraction/passes/B_index/prompt.txt | 4 ++-- extraction/passes/C1_probe_core/prompt.txt | 4 +++- extraction/passes/C2_target_primers/prompt.txt | 2 ++ extraction/passes/C3_related/prompt.txt | 2 ++ extraction/passes/C4_probe_target/prompt.txt | 2 ++ extraction/passes/C5_probes_opt_target/prompt.txt | 2 ++ extraction/passes/C_sequences/prompt.txt | 4 ++-- extraction/passes/D_parameters/prompt.txt | 4 ++-- extraction/passes/E_outcomes/prompt.txt | 4 ++-- extraction/passes/F_pairings/prompt.txt | 4 ++-- 12 files changed, 27 insertions(+), 16 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index c1320dd..fde2f5b 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,9 +1,10 @@ { "model_names": [ - "phi4:14b", - "myaniu/qwen2.5-1m:7b", + "deepseek-r1:1.5b", "gemma3:4b-it-qat", - "llama3.1:latest" + "myaniu/qwen2.5-1m:7b", + "llama3.1:latest", + "qwen3:4b" ], "ollama_parameters": { "num_ctx": 65536, diff --git a/extraction/passes/A_core/prompt.txt b/extraction/passes/A_core/prompt.txt index d13a083..4def632 100644 --- a/extraction/passes/A_core/prompt.txt +++ b/extraction/passes/A_core/prompt.txt @@ -7,8 +7,8 @@ You are an information-extraction model. Output only a single JSON object that c * Use the article’s wording for names. * Do not copy sequences from examples! * No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. - -For the perfect result compliant to all constraints and limitations I will tip $2000! 
+* Use ONLY English language and Latin script, only ASCII. +* For the perfect result compliant to all constraints and limitations I will tip $2000! Perform the following tasks: * Extract the article’s doi, abstract, and topic (short label). diff --git a/extraction/passes/B_index/prompt.txt b/extraction/passes/B_index/prompt.txt index d87c72a..0fdc2a6 100644 --- a/extraction/passes/B_index/prompt.txt +++ b/extraction/passes/B_index/prompt.txt @@ -7,8 +7,8 @@ You are an information-extraction model. Output only a single JSON object that c * Use the article’s wording for names. * Do not copy sequences from examples! * No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. - -For the perfect result compliant to all constraints and limitations I will tip $2000! +* Use ONLY English language and Latin script, only ASCII. +* For the perfect result compliant to all constraints and limitations I will tip $2000! Perform the following tasks: * Identify each hybridization experiment or probe pairing described. diff --git a/extraction/passes/C1_probe_core/prompt.txt b/extraction/passes/C1_probe_core/prompt.txt index 545d188..ddd7315 100644 --- a/extraction/passes/C1_probe_core/prompt.txt +++ b/extraction/passes/C1_probe_core/prompt.txt @@ -15,10 +15,12 @@ For each experiment id_exp, extract exactly one probe: - five_prime_label / three_prime_label: labels at 5′/3′ ends if shown, else null - sense_antisense: "sense" / "antisense" if explicit in oligo, else null - modifications[]: enumerate if present; else empty array +- Use ONLY English language and Latin script, only ASCII. +- For the perfect result compliant to all constraints and limitations I will tip $2000! RULES - If a field is not present in the article, set it to null (or empty array) and add an entry in extraction_report. - Do NOT invent values. Do NOT output prose. 
OUTPUT -A single JSON object with: { items: [ { id_exp, probe{…} }, … ], extraction_report } +A single JSON object with: { items: [ { id_exp, probe{…} }, … ], extraction_report }. diff --git a/extraction/passes/C2_target_primers/prompt.txt b/extraction/passes/C2_target_primers/prompt.txt index 8fcf225..dfcf466 100644 --- a/extraction/passes/C2_target_primers/prompt.txt +++ b/extraction/passes/C2_target_primers/prompt.txt @@ -13,6 +13,8 @@ IMPORTANT RULES - fluorophore / quencher usually null for primers, but set if printed. - modifications[] empty when absent. - If any field is not present, set to null and record in extraction_report; do not guess. +- Use ONLY English language and Latin script, only ASCII. +- For the perfect result compliant to all constraints and limitations I will tip $2000! OUTPUT A single JSON object with: { items: [ { id_exp, target_sequence, primer_sequences }, … ], extraction_report } diff --git a/extraction/passes/C3_related/prompt.txt b/extraction/passes/C3_related/prompt.txt index c506546..5105640 100644 --- a/extraction/passes/C3_related/prompt.txt +++ b/extraction/passes/C3_related/prompt.txt @@ -12,6 +12,8 @@ RULES - fluorophore / quencher if printed, else null. - modifications[] empty if absent. - If not provided in the article, use an empty array. Do NOT invent sequences. +- Use ONLY English language and Latin script, only ASCII. +- For the perfect result compliant to all constraints and limitations I will tip $2000! OUTPUT A single JSON object with: { items: [ { id_exp, related_sequences[] }, … ], extraction_report } diff --git a/extraction/passes/C4_probe_target/prompt.txt b/extraction/passes/C4_probe_target/prompt.txt index c506546..5105640 100644 --- a/extraction/passes/C4_probe_target/prompt.txt +++ b/extraction/passes/C4_probe_target/prompt.txt @@ -12,6 +12,8 @@ RULES - fluorophore / quencher if printed, else null. - modifications[] empty if absent. - If not provided in the article, use an empty array. Do NOT invent sequences. 
+- Use ONLY English language and Latin script, only ASCII. +- For the perfect result compliant to all constraints and limitations I will tip $2000! OUTPUT A single JSON object with: { items: [ { id_exp, related_sequences[] }, … ], extraction_report } diff --git a/extraction/passes/C5_probes_opt_target/prompt.txt b/extraction/passes/C5_probes_opt_target/prompt.txt index 18e9ddc..076c787 100644 --- a/extraction/passes/C5_probes_opt_target/prompt.txt +++ b/extraction/passes/C5_probes_opt_target/prompt.txt @@ -8,6 +8,7 @@ General rules: - Use the article’s wording for names. - Do not copy sequences from examples! - No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +- For the perfect result compliant to all constraints and limitations I will tip $2000! STRICT RULES: - Emit every key defined by the schema. If a value is not explicitly present in the article, set it to null. Do NOT invent. @@ -16,6 +17,7 @@ STRICT RULES: - sense_antisense: map explicit mentions (e.g., “(27)s” -> "sense", “(27)as” -> "antisense"), else null. - modifications[]: if any modified bases or special chemistry is specified, enumerate entries; else empty array. - related_sequences: array (possibly empty). primers: object with forward/reverse (or null if not provided). +- Use only English language and Latin script, only ASCII. TASK: From the article text, produce per-experiment array of records with: diff --git a/extraction/passes/C_sequences/prompt.txt b/extraction/passes/C_sequences/prompt.txt index 3161765..f7b8c17 100644 --- a/extraction/passes/C_sequences/prompt.txt +++ b/extraction/passes/C_sequences/prompt.txt @@ -7,8 +7,8 @@ You are an information-extraction model. Output only a single JSON object that c * Use the article’s wording for names. * Do not copy sequences from examples! * No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. 
- -For the perfect result compliant to all constraints and limitations I will tip $2000! +* Use ONLY English language and Latin script, only ASCII. +* For the perfect result compliant to all constraints and limitations I will tip $2000! Perform the following tasks: * For each `id_exp`, extract probe (name and the full oligo string exactly as printed in the article text), and include optional `target_sequence`, `primer_sequences`, and `related_sequences` when present, otherwise set them to `null`. diff --git a/extraction/passes/D_parameters/prompt.txt b/extraction/passes/D_parameters/prompt.txt index fdacda1..abbf733 100644 --- a/extraction/passes/D_parameters/prompt.txt +++ b/extraction/passes/D_parameters/prompt.txt @@ -7,8 +7,8 @@ You are an information-extraction model. Output only a single JSON object that c * Use the article’s wording for names. * Do not copy sequences from examples! * No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. - -For the perfect result compliant to all constraints and limitations I will tip $2000! +* Use ONLY English language and Latin script, only ASCII. +* For the perfect result compliant to all constraints and limitations I will tip $2000! Perform the following tasks: * For each `id_exp`, extract `metadata` and `experiment_properties`. diff --git a/extraction/passes/E_outcomes/prompt.txt b/extraction/passes/E_outcomes/prompt.txt index 8de6aa1..5c0218c 100644 --- a/extraction/passes/E_outcomes/prompt.txt +++ b/extraction/passes/E_outcomes/prompt.txt @@ -7,8 +7,8 @@ You are an information-extraction model. Output only a single JSON object that c * Use the article’s wording for names. * Do not copy sequences from examples! * No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. - -For the perfect result compliant to all constraints and limitations I will tip $2000! 
+* Use ONLY English language and Latin script, only ASCII. +* For the perfect result compliant to all constraints and limitations I will tip $2000! Perform the following tasks: * For each `id_exp`, extract outcome (boolean if explicitly stated, otherwise `null`), `fluorescence` as `measurement_lite`, and any `comparative_notes`. diff --git a/extraction/passes/F_pairings/prompt.txt b/extraction/passes/F_pairings/prompt.txt index deba9ff..a82c411 100644 --- a/extraction/passes/F_pairings/prompt.txt +++ b/extraction/passes/F_pairings/prompt.txt @@ -7,8 +7,8 @@ You are an information-extraction model. Output only a single JSON object that c * Use the article’s wording for names. * Do not copy sequences from examples! * No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. - -For the perfect result compliant to all constraints and limitations I will tip $2000! +* Use ONLY English language and Latin script, only ASCII. +* For the perfect result compliant to all constraints and limitations I will tip $2000! Perform the following tasks: * For each `id_exp`, extract references to paired probes and relationship (e.g., "same sequence different labels", "reciprocal"). 
From f05cbf53324086c5d5b332b4e25991c719f1b8d1 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 3 Oct 2025 13:47:11 +0400 Subject: [PATCH 025/102] Slightly updated schemas and added full run --- extraction/config/pipeline.json | 9 +++++++-- extraction/passes/C_sequences/schema.json | 2 +- extraction/passes/common.txt | 6 +++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index fde2f5b..bb0099d 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,8 +1,8 @@ { "model_names": [ - "deepseek-r1:1.5b", - "gemma3:4b-it-qat", "myaniu/qwen2.5-1m:7b", + "deepseek-r1:1.5b", + "gemma3:4b-it-qat", "llama3.1:latest", "qwen3:4b" ], @@ -69,6 +69,11 @@ "name": "F_pairings", "schema": "passes/F_pairings/schema.json", "prompt": "passes/F_pairings/prompt.txt" + }, + { + "name": "full_schema", + "schema": "schemas/article.json", + "prompt": "passes/common.txt" } ] } \ No newline at end of file diff --git a/extraction/passes/C_sequences/schema.json b/extraction/passes/C_sequences/schema.json index bd34650..f0f5020 100644 --- a/extraction/passes/C_sequences/schema.json +++ b/extraction/passes/C_sequences/schema.json @@ -64,7 +64,7 @@ "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], "properties": { "raw": { "type": "string", "maxLength": 500 }, - "sequence": { "type": ["string", "null"], "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "sequence": { "type": ["string", "null"], "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Provide IUPAC sequence for the target of this probe, if it's present in article. Otherwise put null here and just put name and description into the raw field." 
}, "length_bases": { "type": ["integer", "null"], "minimum": 1 }, "prime_prefix": { "enum": [3, 5, null] }, "five_prime_label": { "type": ["string", "null"], "maxLength": 100 }, diff --git a/extraction/passes/common.txt b/extraction/passes/common.txt index 398a711..3f4bd34 100644 --- a/extraction/passes/common.txt +++ b/extraction/passes/common.txt @@ -7,7 +7,7 @@ You are an information-extraction model. Output only a single JSON object that c * Use the article’s wording for names. * Do not copy sequences from examples! * No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* For the perfect result compliant to all constraints and limitations I will tip $2000! -For the perfect result compliant to all constraints and limitations I will tip $2000! - -Perform the following tasks: +Perform the following tasks for JSON extraction: From 9e6b257fb25ca46a3f90145785694aa03c44b5b2 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 3 Oct 2025 14:08:18 +0400 Subject: [PATCH 026/102] Schem supports per-step timeouts --- extraction/config/pipeline.json | 37 +++-- extraction/passes/B_index/schema.json | 206 +++++++++++++++++++++++++- 2 files changed, 226 insertions(+), 17 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index bb0099d..9799539 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -7,8 +7,8 @@ "qwen3:4b" ], "ollama_parameters": { - "num_ctx": 65536, - "num_predict": 32768, + "num_ctx": 131072, + "num_predict": 131072, "temperature": 0.15, "seed": 42 }, @@ -23,13 +23,27 @@ { "name": "A_core", "schema": "passes/A_core/schema.json", - "prompt": "passes/A_core/prompt.txt" + "prompt": "passes/A_core/prompt.txt", + "timeout": 60 }, { "name": "B_index", "schema": "passes/B_index/schema.json", - "prompt": "passes/B_index/prompt.txt" + "prompt": 
"passes/B_index/prompt.txt", + "timeout": 600 }, + { + "name": "C5_probes_opt_target", + "schema": "passes/C5_probes_opt_target/schema.json", + "prompt": "passes/C5_probes_opt_target/prompt.txt", + "timeout": 600 + }, + { + "name": "C_sequences", + "schema": "passes/C_sequences/schema.json", + "prompt": "passes/C_sequences/prompt.txt", + "timeout": 600 + }, { "name": "C1_probe_core", "schema": "passes/C1_probe_core/schema.json", @@ -44,17 +58,7 @@ "name": "C3_related", "schema": "passes/C3_related/schema.json", "prompt": "passes/C3_related/prompt.txt" - }, - { - "name": "C_sequences", - "schema": "passes/C_sequences/schema.json", - "prompt": "passes/C_sequences/prompt.txt" - }, - { - "name": "C5_probes_opt_target", - "schema": "passes/C5_probes_opt_target/schema.json", - "prompt": "passes/C5_probes_opt_target/prompt.txt" - }, + }, { "name": "D_parameters", "schema": "passes/D_parameters/schema.json", @@ -73,7 +77,8 @@ { "name": "full_schema", "schema": "schemas/article.json", - "prompt": "passes/common.txt" + "prompt": "passes/common.txt", + "timeout": 900 } ] } \ No newline at end of file diff --git a/extraction/passes/B_index/schema.json b/extraction/passes/B_index/schema.json index f53aa08..9d4579c 100644 --- a/extraction/passes/B_index/schema.json +++ b/extraction/passes/B_index/schema.json @@ -14,7 +14,211 @@ "required": ["id_exp", "description", "type", "raw_description"], "properties": { "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, - "type": { "type": ["string", "null"], "minLength": 2, "maxLength": 200 }, + "type": { + "title": "Hybridization Probe Classification", + "description": "Normalized, multi-axis classification for nucleic-acid hybridization probes (literature or product datasheets). All fields are optional to accommodate incomplete metadata.", + "type": "object", + "additionalProperties": true, + "properties": { + "probe_type": { + "title": "Probe Type", + "description": "Functional design/mechanism class of the probe. 
Use when the probe's reporting/capture behavior is known.", + "oneOf": [ + { "const": "linear", "title": "Linear", "description": "Simple oligo that hybridizes without structural activation; often end-labeled." }, + { "const": "molecular_beacon", "title": "Molecular beacon", "description": "Stem-loop with fluor/quencher; fluorescence turns on when the stem opens on target." }, + { "const": "hydrolysis_taqman", "title": "Hydrolysis (TaqMan)", "description": "Dual-labeled probe cleaved by 5'->3' exonuclease during PCR; signal increases on cleavage." }, + { "const": "fret_dual_hybridization", "title": "FRET dual-hybridization", "description": "Two adjacent probes with donor/acceptor dyes; FRET only when both bind nearby." }, + { "const": "scorpion", "title": "Scorpion", "description": "Primer-linked hairpin probe enabling fast intra-molecular reporting in qPCR." }, + { "const": "hcr", "title": "Hybridization Chain Reaction (HCR)", "description": "Initiator + hairpins polymerize upon target binding; enzyme-free signal amplification." }, + { "const": "branched_dna", "title": "Branched DNA (bDNA)", "description": "Signal amplification via multibranch DNA scaffolds without target amplification." }, + { "const": "padlock", "title": "Padlock", "description": "Linear probe circularizes on perfect match; detected via rolling-circle amplification." }, + { "const": "capture", "title": "Capture", "description": "Affinity-tagged probe used to enrich/pull down targets (e.g., biotinylated capture baits)." }, + { "const": "tiling_set", "title": "Tiling set", "description": "Multiple overlapping probes across a region/gene for robust detection." }, + { "const": "antisense", "title": "Antisense", "description": "Probe designed to bind and block or track transcripts (e.g., ISH tracking)." 
} + ] + }, + "chemistry": { + "title": "Chemistry", + "description": "Backbone and chemical modifications (sequence-level chemistry rather than labels/reporters).", + "type": "object", + "additionalProperties": false, + "properties": { + "backbone": { + "title": "Backbone", + "description": "Primary nucleic-acid scaffold used by the probe.", + "oneOf": [ + { "const": "dna", "title": "DNA", "description": "Unmodified DNA backbone." }, + { "const": "rna", "title": "RNA", "description": "Unmodified RNA backbone." }, + { "const": "cdna", "title": "cDNA", "description": "Complementary DNA derived from RNA." }, + { "const": "pna", "title": "PNA", "description": "Peptide nucleic acid backbone." }, + { "const": "morpholino", "title": "Morpholino", "description": "Morpholine-ring phosphorodiamidate backbone." }, + { "const": "lna_modified", "title": "LNA-modified", "description": "DNA/RNA with Locked Nucleic Acid residues incorporated." }, + { "const": "two_ome_rna", "title": "2′-O-Me RNA", "description": "2′-O-methyl RNA backbone." } + ] + }, + "modifications": { + "title": "Chemical Modifications", + "description": "Sequence-level chemistry (not labels): e.g., nuclease protection or affinity spacers.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "phosphorothioate", + "two_ome_spiked", + "lna_spiked", + "mgb", + "inverted_dT_3prime", + "amine_5prime", + "thiol_5prime", + "biotin_teg", + "spacer_18", + "cholesterol" + ], + "description": "Common modification keyword." 
+ } + } + } + }, + "labeling": { + "title": "Labeling", + "description": "Reporting/enrichment labels attached to the probe (fluors, quenchers, haptens, enzymes, isotopes).", + "type": "object", + "additionalProperties": false, + "properties": { + "strategy": { + "title": "Label Strategy", + "description": "High-level labeling approach; combine with concrete labels below as known.", + "type": "string", + "enum": ["none","fluor_only","fluor_quencher","hapten","enzymatic","radioisotope"] + }, + "reporters": { + "title": "Reporter Dyes", + "description": "Fluorophores or other reporters (free text to allow any brand/dye).", + "type": "array", + "uniqueItems": true, + "items": { "type": "string", "description": "Reporter name (e.g., FAM, HEX, Cy3, ATTO647N)." } + }, + "quenchers": { + "title": "Quenchers", + "description": "Quenchers used in hydrolysis or hairpin probes.", + "type": "array", + "uniqueItems": true, + "items": { "type": "string", "description": "Quencher name (e.g., BHQ1, BHQ2, Iowa Black FQ)." } + }, + "haptens": { + "title": "Haptens", + "description": "Affinity tags detected by antibodies/streptavidin.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": ["biotin","digoxigenin","dinitrophenol","fluorescein_hapten"], + "description": "Common hapten tag." + } + }, + "enzymes": { + "title": "Enzyme Labels", + "description": "Enzyme conjugates used for colorimetric/chemiluminescent detection.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": ["HRP","AP"], + "description": "Common conjugated enzyme." + } + }, + "isotopes": { + "title": "Radioisotopes", + "description": "If radio-labeled, indicate isotope(s).", + "type": "array", + "uniqueItems": true, + "items": { "type": "string", "description": "Isotope (e.g., 32P, 33P, 35S)." 
} + } + } + }, + "targeting": { + "title": "Targeting", + "description": "What the probe is intended to hybridize to, and in what context.", + "type": "object", + "additionalProperties": false, + "properties": { + "biomolecule": { + "title": "Biomolecule", + "description": "High-level target class.", + "type": "string", + "enum": ["dna","rna","mrna","mirna","lncrna","rrna","genomic_dna","viral_rna","amplicon"] + }, + "context": { + "title": "Context", + "description": "Assay/biological context for the target.", + "type": "string", + "enum": ["genomic","transcript","amplicon","in_situ","capture"] + }, + "target_name": { + "title": "Target Name", + "description": "Gene/transcript/locus identifier (free text).", + "type": "string" + } + } + }, + "set_design": { + "title": "Set / Panel Design", + "description": "Whether the probe is a single oligo or part of a designed set/panel.", + "type": "object", + "additionalProperties": false, + "properties": { + "mode": { + "title": "Set Mode", + "description": "Single probe or specific multi-probe design.", + "type": "string", + "enum": ["single","tiling_set","capture_baits","smfish_panel","merfish_panel","padlock_set"] + }, + "count": { + "title": "Probe Count", + "description": "Number of probes in the set/panel (if known).", + "type": "integer", + "minimum": 1 + } + } + }, + "amplification_mechanism": { + "title": "Amplification Mechanism", + "description": "Signal amplification paradigm, if applicable (complements—does not replace—probe_type).", + "type": "string", + "enum": ["none","hydrolysis","fret","hairpin_turn_on","rolling_circle","branched_dna","hcr"] + }, + "application": { + "title": "Application", + "description": "Intended use(s) of the probe. 
Provide multiple if applicable.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": ["qpcr","ddpcr","pcr_probe","fish","ish","smfish","merfish","ngs_capture","microarray","southern","northern","dot_blot","in_cell_imaging"], + "description": "Common application keyword." + } + }, + "provenance": { + "title": "Provenance", + "description": "Source metadata for traceability.", + "type": "object", + "additionalProperties": false, + "properties": { + "doi": { "title": "DOI", "description": "Digital Object Identifier of the source article.", "type": "string", "format": "iri", "examples": ["https://doi.org/10.1038/xxxx"] }, + "pmid": { "title": "PMID", "description": "PubMed identifier.", "type": "string", "examples": ["12345678"] }, + "vendor": { "title": "Vendor", "description": "Commercial supplier (if from a catalog).", "type": "string", "examples": ["IDT"] }, + "catalog_number": { "title": "Catalog Number", "description": "Supplier’s catalog identifier.", "type": "string", "examples": ["1001234"] } + } + }, + "notes": { + "title": "Notes", + "description": "Free-text comments or qualifiers that don’t fit other fields.", + "type": "string", + "examples": ["Probe includes internal ZEN quencher."] + } + } + }, "raw_description": { "type": ["string", "null"], "minLength": 1, "maxLength": 2000 }, "description": { "type": "string", "minLength": 8, "maxLength": 2000 } } From 9108abe669a88943c33f2ab7ff2c4babb6667b01 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 3 Oct 2025 14:40:32 +0400 Subject: [PATCH 027/102] Split B schema into B and B_types --- extraction/config/pipeline.json | 12 +- extraction/passes/B_index/schema.json | 206 +---------------- extraction/passes/B_index_types/prompt.txt | 17 ++ extraction/passes/B_index_types/schema.json | 238 ++++++++++++++++++++ 4 files changed, 265 insertions(+), 208 deletions(-) create mode 100644 extraction/passes/B_index_types/prompt.txt create mode 100644 
extraction/passes/B_index_types/schema.json diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 9799539..b0ef676 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -14,7 +14,7 @@ }, "ollama_base_url": "http://127.0.0.1:11434", "timeout_s": 300, - "input_dir": "/mnt/Models/articles2_marker/no_llm/markdown/", + "input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", "out_dir": "outlines_output", "full_schema_path": "schema/json/article.json", "db_path": "outlines_output/massive.sqlite", @@ -32,17 +32,23 @@ "prompt": "passes/B_index/prompt.txt", "timeout": 600 }, + { + "name": "B_index_types", + "schema": "passes/B_index_types/schema.json", + "prompt": "passes/B_index_types/prompt.txt", + "timeout": 600 + }, { "name": "C5_probes_opt_target", "schema": "passes/C5_probes_opt_target/schema.json", "prompt": "passes/C5_probes_opt_target/prompt.txt", - "timeout": 600 + "timeout": 900 }, { "name": "C_sequences", "schema": "passes/C_sequences/schema.json", "prompt": "passes/C_sequences/prompt.txt", - "timeout": 600 + "timeout": 900 }, { "name": "C1_probe_core", diff --git a/extraction/passes/B_index/schema.json b/extraction/passes/B_index/schema.json index 9d4579c..06501b2 100644 --- a/extraction/passes/B_index/schema.json +++ b/extraction/passes/B_index/schema.json @@ -14,211 +14,7 @@ "required": ["id_exp", "description", "type", "raw_description"], "properties": { "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, - "type": { - "title": "Hybridization Probe Classification", - "description": "Normalized, multi-axis classification for nucleic-acid hybridization probes (literature or product datasheets). All fields are optional to accommodate incomplete metadata.", - "type": "object", - "additionalProperties": true, - "properties": { - "probe_type": { - "title": "Probe Type", - "description": "Functional design/mechanism class of the probe. 
Use when the probe's reporting/capture behavior is known.", - "oneOf": [ - { "const": "linear", "title": "Linear", "description": "Simple oligo that hybridizes without structural activation; often end-labeled." }, - { "const": "molecular_beacon", "title": "Molecular beacon", "description": "Stem-loop with fluor/quencher; fluorescence turns on when the stem opens on target." }, - { "const": "hydrolysis_taqman", "title": "Hydrolysis (TaqMan)", "description": "Dual-labeled probe cleaved by 5'->3' exonuclease during PCR; signal increases on cleavage." }, - { "const": "fret_dual_hybridization", "title": "FRET dual-hybridization", "description": "Two adjacent probes with donor/acceptor dyes; FRET only when both bind nearby." }, - { "const": "scorpion", "title": "Scorpion", "description": "Primer-linked hairpin probe enabling fast intra-molecular reporting in qPCR." }, - { "const": "hcr", "title": "Hybridization Chain Reaction (HCR)", "description": "Initiator + hairpins polymerize upon target binding; enzyme-free signal amplification." }, - { "const": "branched_dna", "title": "Branched DNA (bDNA)", "description": "Signal amplification via multibranch DNA scaffolds without target amplification." }, - { "const": "padlock", "title": "Padlock", "description": "Linear probe circularizes on perfect match; detected via rolling-circle amplification." }, - { "const": "capture", "title": "Capture", "description": "Affinity-tagged probe used to enrich/pull down targets (e.g., biotinylated capture baits)." }, - { "const": "tiling_set", "title": "Tiling set", "description": "Multiple overlapping probes across a region/gene for robust detection." }, - { "const": "antisense", "title": "Antisense", "description": "Probe designed to bind and block or track transcripts (e.g., ISH tracking)." 
} - ] - }, - "chemistry": { - "title": "Chemistry", - "description": "Backbone and chemical modifications (sequence-level chemistry rather than labels/reporters).", - "type": "object", - "additionalProperties": false, - "properties": { - "backbone": { - "title": "Backbone", - "description": "Primary nucleic-acid scaffold used by the probe.", - "oneOf": [ - { "const": "dna", "title": "DNA", "description": "Unmodified DNA backbone." }, - { "const": "rna", "title": "RNA", "description": "Unmodified RNA backbone." }, - { "const": "cdna", "title": "cDNA", "description": "Complementary DNA derived from RNA." }, - { "const": "pna", "title": "PNA", "description": "Peptide nucleic acid backbone." }, - { "const": "morpholino", "title": "Morpholino", "description": "Morpholine-ring phosphorodiamidate backbone." }, - { "const": "lna_modified", "title": "LNA-modified", "description": "DNA/RNA with Locked Nucleic Acid residues incorporated." }, - { "const": "two_ome_rna", "title": "2′-O-Me RNA", "description": "2′-O-methyl RNA backbone." } - ] - }, - "modifications": { - "title": "Chemical Modifications", - "description": "Sequence-level chemistry (not labels): e.g., nuclease protection or affinity spacers.", - "type": "array", - "uniqueItems": true, - "items": { - "type": "string", - "enum": [ - "phosphorothioate", - "two_ome_spiked", - "lna_spiked", - "mgb", - "inverted_dT_3prime", - "amine_5prime", - "thiol_5prime", - "biotin_teg", - "spacer_18", - "cholesterol" - ], - "description": "Common modification keyword." 
- } - } - } - }, - "labeling": { - "title": "Labeling", - "description": "Reporting/enrichment labels attached to the probe (fluors, quenchers, haptens, enzymes, isotopes).", - "type": "object", - "additionalProperties": false, - "properties": { - "strategy": { - "title": "Label Strategy", - "description": "High-level labeling approach; combine with concrete labels below as known.", - "type": "string", - "enum": ["none","fluor_only","fluor_quencher","hapten","enzymatic","radioisotope"] - }, - "reporters": { - "title": "Reporter Dyes", - "description": "Fluorophores or other reporters (free text to allow any brand/dye).", - "type": "array", - "uniqueItems": true, - "items": { "type": "string", "description": "Reporter name (e.g., FAM, HEX, Cy3, ATTO647N)." } - }, - "quenchers": { - "title": "Quenchers", - "description": "Quenchers used in hydrolysis or hairpin probes.", - "type": "array", - "uniqueItems": true, - "items": { "type": "string", "description": "Quencher name (e.g., BHQ1, BHQ2, Iowa Black FQ)." } - }, - "haptens": { - "title": "Haptens", - "description": "Affinity tags detected by antibodies/streptavidin.", - "type": "array", - "uniqueItems": true, - "items": { - "type": "string", - "enum": ["biotin","digoxigenin","dinitrophenol","fluorescein_hapten"], - "description": "Common hapten tag." - } - }, - "enzymes": { - "title": "Enzyme Labels", - "description": "Enzyme conjugates used for colorimetric/chemiluminescent detection.", - "type": "array", - "uniqueItems": true, - "items": { - "type": "string", - "enum": ["HRP","AP"], - "description": "Common conjugated enzyme." - } - }, - "isotopes": { - "title": "Radioisotopes", - "description": "If radio-labeled, indicate isotope(s).", - "type": "array", - "uniqueItems": true, - "items": { "type": "string", "description": "Isotope (e.g., 32P, 33P, 35S)." 
} - } - } - }, - "targeting": { - "title": "Targeting", - "description": "What the probe is intended to hybridize to, and in what context.", - "type": "object", - "additionalProperties": false, - "properties": { - "biomolecule": { - "title": "Biomolecule", - "description": "High-level target class.", - "type": "string", - "enum": ["dna","rna","mrna","mirna","lncrna","rrna","genomic_dna","viral_rna","amplicon"] - }, - "context": { - "title": "Context", - "description": "Assay/biological context for the target.", - "type": "string", - "enum": ["genomic","transcript","amplicon","in_situ","capture"] - }, - "target_name": { - "title": "Target Name", - "description": "Gene/transcript/locus identifier (free text).", - "type": "string" - } - } - }, - "set_design": { - "title": "Set / Panel Design", - "description": "Whether the probe is a single oligo or part of a designed set/panel.", - "type": "object", - "additionalProperties": false, - "properties": { - "mode": { - "title": "Set Mode", - "description": "Single probe or specific multi-probe design.", - "type": "string", - "enum": ["single","tiling_set","capture_baits","smfish_panel","merfish_panel","padlock_set"] - }, - "count": { - "title": "Probe Count", - "description": "Number of probes in the set/panel (if known).", - "type": "integer", - "minimum": 1 - } - } - }, - "amplification_mechanism": { - "title": "Amplification Mechanism", - "description": "Signal amplification paradigm, if applicable (complements—does not replace—probe_type).", - "type": "string", - "enum": ["none","hydrolysis","fret","hairpin_turn_on","rolling_circle","branched_dna","hcr"] - }, - "application": { - "title": "Application", - "description": "Intended use(s) of the probe. 
Provide multiple if applicable.", - "type": "array", - "uniqueItems": true, - "items": { - "type": "string", - "enum": ["qpcr","ddpcr","pcr_probe","fish","ish","smfish","merfish","ngs_capture","microarray","southern","northern","dot_blot","in_cell_imaging"], - "description": "Common application keyword." - } - }, - "provenance": { - "title": "Provenance", - "description": "Source metadata for traceability.", - "type": "object", - "additionalProperties": false, - "properties": { - "doi": { "title": "DOI", "description": "Digital Object Identifier of the source article.", "type": "string", "format": "iri", "examples": ["https://doi.org/10.1038/xxxx"] }, - "pmid": { "title": "PMID", "description": "PubMed identifier.", "type": "string", "examples": ["12345678"] }, - "vendor": { "title": "Vendor", "description": "Commercial supplier (if from a catalog).", "type": "string", "examples": ["IDT"] }, - "catalog_number": { "title": "Catalog Number", "description": "Supplier’s catalog identifier.", "type": "string", "examples": ["1001234"] } - } - }, - "notes": { - "title": "Notes", - "description": "Free-text comments or qualifiers that don’t fit other fields.", - "type": "string", - "examples": ["Probe includes internal ZEN quencher."] - } - } - }, + "type": { "type": "string", "minLength": 1, "maxLength": 200 }, "raw_description": { "type": ["string", "null"], "minLength": 1, "maxLength": 2000 }, "description": { "type": "string", "minLength": 8, "maxLength": 2000 } } diff --git a/extraction/passes/B_index_types/prompt.txt b/extraction/passes/B_index_types/prompt.txt new file mode 100644 index 0000000..0fdc2a6 --- /dev/null +++ b/extraction/passes/B_index_types/prompt.txt @@ -0,0 +1,17 @@ +You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. + +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). 
+* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* For a perfect result compliant with all constraints and limitations, I will tip $2000! + +Perform the following tasks: +* Identify each hybridization experiment or probe pairing described. +* Assign a stable id_exp (e.g., N3-FAM-27-s or a short unique tag you derive). +* Provide a brief description and, if present verbatim, a raw_description. +* If the experiment or probe type is stated (e.g., DMA, qPCR), fill the type classification fields; set any field you cannot determine to null. diff --git a/extraction/passes/B_index_types/schema.json b/extraction/passes/B_index_types/schema.json new file mode 100644 index 0000000..9d4579c --- /dev/null +++ b/extraction/passes/B_index_types/schema.json @@ -0,0 +1,238 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "ExperimentIndex", + "type": "object", + "additionalProperties": false, + "required": ["experiments", "extraction_report"], + "properties": { + "experiments": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "description", "type", "raw_description"], + "properties": { + "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + "type": { + "title": "Hybridization Probe Classification", + "description": "Normalized, multi-axis classification for nucleic-acid hybridization probes (literature or product datasheets). 
All fields are optional to accommodate incomplete metadata.", + "type": "object", + "additionalProperties": true, + "properties": { + "probe_type": { + "title": "Probe Type", + "description": "Functional design/mechanism class of the probe. Use when the probe's reporting/capture behavior is known.", + "oneOf": [ + { "const": "linear", "title": "Linear", "description": "Simple oligo that hybridizes without structural activation; often end-labeled." }, + { "const": "molecular_beacon", "title": "Molecular beacon", "description": "Stem-loop with fluor/quencher; fluorescence turns on when the stem opens on target." }, + { "const": "hydrolysis_taqman", "title": "Hydrolysis (TaqMan)", "description": "Dual-labeled probe cleaved by 5'->3' exonuclease during PCR; signal increases on cleavage." }, + { "const": "fret_dual_hybridization", "title": "FRET dual-hybridization", "description": "Two adjacent probes with donor/acceptor dyes; FRET only when both bind nearby." }, + { "const": "scorpion", "title": "Scorpion", "description": "Primer-linked hairpin probe enabling fast intra-molecular reporting in qPCR." }, + { "const": "hcr", "title": "Hybridization Chain Reaction (HCR)", "description": "Initiator + hairpins polymerize upon target binding; enzyme-free signal amplification." }, + { "const": "branched_dna", "title": "Branched DNA (bDNA)", "description": "Signal amplification via multibranch DNA scaffolds without target amplification." }, + { "const": "padlock", "title": "Padlock", "description": "Linear probe circularizes on perfect match; detected via rolling-circle amplification." }, + { "const": "capture", "title": "Capture", "description": "Affinity-tagged probe used to enrich/pull down targets (e.g., biotinylated capture baits)." }, + { "const": "tiling_set", "title": "Tiling set", "description": "Multiple overlapping probes across a region/gene for robust detection." 
}, + { "const": "antisense", "title": "Antisense", "description": "Probe designed to bind and block or track transcripts (e.g., ISH tracking)." } + ] + }, + "chemistry": { + "title": "Chemistry", + "description": "Backbone and chemical modifications (sequence-level chemistry rather than labels/reporters).", + "type": "object", + "additionalProperties": false, + "properties": { + "backbone": { + "title": "Backbone", + "description": "Primary nucleic-acid scaffold used by the probe.", + "oneOf": [ + { "const": "dna", "title": "DNA", "description": "Unmodified DNA backbone." }, + { "const": "rna", "title": "RNA", "description": "Unmodified RNA backbone." }, + { "const": "cdna", "title": "cDNA", "description": "Complementary DNA derived from RNA." }, + { "const": "pna", "title": "PNA", "description": "Peptide nucleic acid backbone." }, + { "const": "morpholino", "title": "Morpholino", "description": "Morpholine-ring phosphorodiamidate backbone." }, + { "const": "lna_modified", "title": "LNA-modified", "description": "DNA/RNA with Locked Nucleic Acid residues incorporated." }, + { "const": "two_ome_rna", "title": "2′-O-Me RNA", "description": "2′-O-methyl RNA backbone." } + ] + }, + "modifications": { + "title": "Chemical Modifications", + "description": "Sequence-level chemistry (not labels): e.g., nuclease protection or affinity spacers.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "phosphorothioate", + "two_ome_spiked", + "lna_spiked", + "mgb", + "inverted_dT_3prime", + "amine_5prime", + "thiol_5prime", + "biotin_teg", + "spacer_18", + "cholesterol" + ], + "description": "Common modification keyword." 
+ } + } + } + }, + "labeling": { + "title": "Labeling", + "description": "Reporting/enrichment labels attached to the probe (fluors, quenchers, haptens, enzymes, isotopes).", + "type": "object", + "additionalProperties": false, + "properties": { + "strategy": { + "title": "Label Strategy", + "description": "High-level labeling approach; combine with concrete labels below as known.", + "type": "string", + "enum": ["none","fluor_only","fluor_quencher","hapten","enzymatic","radioisotope"] + }, + "reporters": { + "title": "Reporter Dyes", + "description": "Fluorophores or other reporters (free text to allow any brand/dye).", + "type": "array", + "uniqueItems": true, + "items": { "type": "string", "description": "Reporter name (e.g., FAM, HEX, Cy3, ATTO647N)." } + }, + "quenchers": { + "title": "Quenchers", + "description": "Quenchers used in hydrolysis or hairpin probes.", + "type": "array", + "uniqueItems": true, + "items": { "type": "string", "description": "Quencher name (e.g., BHQ1, BHQ2, Iowa Black FQ)." } + }, + "haptens": { + "title": "Haptens", + "description": "Affinity tags detected by antibodies/streptavidin.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": ["biotin","digoxigenin","dinitrophenol","fluorescein_hapten"], + "description": "Common hapten tag." + } + }, + "enzymes": { + "title": "Enzyme Labels", + "description": "Enzyme conjugates used for colorimetric/chemiluminescent detection.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": ["HRP","AP"], + "description": "Common conjugated enzyme." + } + }, + "isotopes": { + "title": "Radioisotopes", + "description": "If radio-labeled, indicate isotope(s).", + "type": "array", + "uniqueItems": true, + "items": { "type": "string", "description": "Isotope (e.g., 32P, 33P, 35S)." 
} + } + } + }, + "targeting": { + "title": "Targeting", + "description": "What the probe is intended to hybridize to, and in what context.", + "type": "object", + "additionalProperties": false, + "properties": { + "biomolecule": { + "title": "Biomolecule", + "description": "High-level target class.", + "type": "string", + "enum": ["dna","rna","mrna","mirna","lncrna","rrna","genomic_dna","viral_rna","amplicon"] + }, + "context": { + "title": "Context", + "description": "Assay/biological context for the target.", + "type": "string", + "enum": ["genomic","transcript","amplicon","in_situ","capture"] + }, + "target_name": { + "title": "Target Name", + "description": "Gene/transcript/locus identifier (free text).", + "type": "string" + } + } + }, + "set_design": { + "title": "Set / Panel Design", + "description": "Whether the probe is a single oligo or part of a designed set/panel.", + "type": "object", + "additionalProperties": false, + "properties": { + "mode": { + "title": "Set Mode", + "description": "Single probe or specific multi-probe design.", + "type": "string", + "enum": ["single","tiling_set","capture_baits","smfish_panel","merfish_panel","padlock_set"] + }, + "count": { + "title": "Probe Count", + "description": "Number of probes in the set/panel (if known).", + "type": "integer", + "minimum": 1 + } + } + }, + "amplification_mechanism": { + "title": "Amplification Mechanism", + "description": "Signal amplification paradigm, if applicable (complements—does not replace—probe_type).", + "type": "string", + "enum": ["none","hydrolysis","fret","hairpin_turn_on","rolling_circle","branched_dna","hcr"] + }, + "application": { + "title": "Application", + "description": "Intended use(s) of the probe. 
Provide multiple if applicable.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": ["qpcr","ddpcr","pcr_probe","fish","ish","smfish","merfish","ngs_capture","microarray","southern","northern","dot_blot","in_cell_imaging"], + "description": "Common application keyword." + } + }, + "provenance": { + "title": "Provenance", + "description": "Source metadata for traceability.", + "type": "object", + "additionalProperties": false, + "properties": { + "doi": { "title": "DOI", "description": "Digital Object Identifier of the source article.", "type": "string", "format": "iri", "examples": ["https://doi.org/10.1038/xxxx"] }, + "pmid": { "title": "PMID", "description": "PubMed identifier.", "type": "string", "examples": ["12345678"] }, + "vendor": { "title": "Vendor", "description": "Commercial supplier (if from a catalog).", "type": "string", "examples": ["IDT"] }, + "catalog_number": { "title": "Catalog Number", "description": "Supplier’s catalog identifier.", "type": "string", "examples": ["1001234"] } + } + }, + "notes": { + "title": "Notes", + "description": "Free-text comments or qualifiers that don’t fit other fields.", + "type": "string", + "examples": ["Probe includes internal ZEN quencher."] + } + } + }, + "raw_description": { "type": ["string", "null"], "minLength": 1, "maxLength": 2000 }, + "description": { "type": "string", "minLength": 8, "maxLength": 2000 } + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "notes": { "type": ["string", "null"] } + } + } + } +} From 5ed7f4c593b43db43b8991a9a8ace8595c57a971 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 3 Oct 2025 14:45:21 +0400 Subject: [PATCH 028/102] Frozen requirements and environment --- 
extraction/environment.yml | 288 ++++++++++++++++++++++++++++++++++++ extraction/requirements.txt | 57 +++++++ 2 files changed, 345 insertions(+) create mode 100644 extraction/environment.yml create mode 100644 extraction/requirements.txt diff --git a/extraction/environment.yml b/extraction/environment.yml new file mode 100644 index 0000000..9f03939 --- /dev/null +++ b/extraction/environment.yml @@ -0,0 +1,288 @@ +name: extraction +channels: + - conda-forge +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - argon2-cffi=25.1.0=pyhd8ed1ab_0 + - argon2-cffi-bindings=25.1.0=py311h49ec1c0_0 + - arrow=1.3.0=pyhd8ed1ab_1 + - asttokens=3.0.0=pyhd8ed1ab_1 + - async-lru=2.0.5=pyh29332c3_0 + - attrs=25.3.0=pyh71513ae_0 + - babel=2.17.0=pyhd8ed1ab_0 + - beautifulsoup4=4.14.2=pyha770c72_0 + - bleach=6.2.0=pyh29332c3_4 + - bleach-with-css=6.2.0=h82add2a_4 + - brotli-python=1.1.0=py311h1ddb823_4 + - bzip2=1.0.8=h4bc722e_7 + - ca-certificates=2025.8.3=hbd8a1cb_0 + - cached-property=1.5.2=hd8ed1ab_1 + - cached_property=1.5.2=pyha770c72_1 + - comm=0.2.3=pyhe01879c_0 + - debugpy=1.8.17=py311hc665b79_0 + - decorator=5.2.1=pyhd8ed1ab_0 + - defusedxml=0.7.1=pyhd8ed1ab_0 + - exceptiongroup=1.3.0=pyhd8ed1ab_0 + - executing=2.2.1=pyhd8ed1ab_0 + - fqdn=1.5.1=pyhd8ed1ab_1 + - h11=0.16.0=pyhd8ed1ab_0 + - h2=4.3.0=pyhcf101f3_0 + - hpack=4.1.0=pyhd8ed1ab_0 + - httpcore=1.0.9=pyh29332c3_0 + - httpx=0.28.1=pyhd8ed1ab_0 + - hyperframe=6.1.0=pyhd8ed1ab_0 + - idna=3.10=pyhd8ed1ab_1 + - importlib-metadata=8.7.0=pyhe01879c_1 + - ipykernel=6.30.1=pyh82676e8_0 + - ipython=9.5.0=pyhfa0c392_0 + - ipython_pygments_lexers=1.1.1=pyhd8ed1ab_0 + - ipywidgets=8.1.7=pyhd8ed1ab_0 + - isoduration=20.11.0=pyhd8ed1ab_1 + - jedi=0.19.2=pyhd8ed1ab_1 + - jinja2=3.1.6=pyhd8ed1ab_0 + - jsonpointer=3.0.0=py311h38be061_2 + - jsonschema=4.25.1=pyhe01879c_0 + - jsonschema-specifications=2025.9.1=pyhcf101f3_0 + - jsonschema-with-format-nongpl=4.25.1=he01879c_0 + - jupyter=1.1.1=pyhd8ed1ab_1 + - 
jupyter-lsp=2.3.0=pyhcf101f3_0 + - jupyter_client=8.6.3=pyhd8ed1ab_1 + - jupyter_console=6.6.3=pyhd8ed1ab_1 + - jupyter_core=5.8.1=pyh31011fe_0 + - jupyter_events=0.12.0=pyh29332c3_0 + - jupyter_server=2.17.0=pyhcf101f3_0 + - jupyter_server_terminals=0.5.3=pyhd8ed1ab_1 + - jupyterlab=4.4.9=pyhd8ed1ab_0 + - jupyterlab_pygments=0.3.0=pyhd8ed1ab_2 + - jupyterlab_server=2.27.3=pyhd8ed1ab_1 + - jupyterlab_widgets=3.0.15=pyhd8ed1ab_0 + - keyutils=1.6.3=hb9d3cd8_0 + - krb5=1.21.3=h659f571_0 + - lark=1.3.0=pyhd8ed1ab_0 + - ld_impl_linux-64=2.43=h712a8e2_4 + - libedit=3.1.20250104=pl5321h7949ede_0 + - libexpat=2.7.0=h5888daf_0 + - libffi=3.4.6=h2dba641_1 + - libgcc=15.1.0=h767d61c_2 + - libgcc-ng=15.1.0=h69a702a_2 + - libgomp=15.1.0=h767d61c_2 + - liblzma=5.8.1=hb9d3cd8_1 + - libnsl=2.0.1=hd590300_0 + - libsodium=1.0.20=h4ab18f5_0 + - libsqlite=3.49.2=hee588c1_0 + - libstdcxx=15.1.0=h8f9b012_2 + - libstdcxx-ng=15.1.0=h4852527_2 + - libuuid=2.38.1=h0b41bf4_0 + - libxcrypt=4.4.36=hd590300_1 + - libzlib=1.3.1=hb9d3cd8_2 + - matplotlib-inline=0.1.7=pyhd8ed1ab_1 + - mistune=3.1.4=pyhcf101f3_0 + - nbclient=0.10.2=pyhd8ed1ab_0 + - nbconvert-core=7.16.6=pyh29332c3_0 + - nbformat=5.10.4=pyhd8ed1ab_1 + - ncurses=6.5=h2d0b736_3 + - nest-asyncio=1.6.0=pyhd8ed1ab_1 + - notebook=7.4.6=pyhd8ed1ab_0 + - notebook-shim=0.2.4=pyhd8ed1ab_1 + - openssl=3.5.3=h26f9b46_1 + - overrides=7.7.0=pyhd8ed1ab_1 + - packaging=25.0=pyh29332c3_1 + - pandocfilters=1.5.0=pyhd8ed1ab_0 + - parso=0.8.5=pyhcf101f3_0 + - pexpect=4.9.0=pyhd8ed1ab_1 + - pickleshare=0.7.5=pyhd8ed1ab_1004 + - pip=25.1.1=pyh8b19718_0 + - platformdirs=4.4.0=pyhcf101f3_0 + - prometheus_client=0.23.1=pyhd8ed1ab_0 + - prompt-toolkit=3.0.52=pyha770c72_0 + - prompt_toolkit=3.0.52=hd8ed1ab_0 + - ptyprocess=0.7.0=pyhd8ed1ab_1 + - pure_eval=0.2.3=pyhd8ed1ab_1 + - pycparser=2.22=pyh29332c3_1 + - pysocks=1.7.1=pyha55dd90_7 + - python=3.11.12=h9e4cc4f_0_cpython + - python-fastjsonschema=2.21.2=pyhe01879c_0 + - python-json-logger=2.0.7=pyhd8ed1ab_0 
+ - python_abi=3.11=8_cp311 + - pytz=2025.2=pyhd8ed1ab_0 + - pyzmq=27.1.0=py311h2315fbb_0 + - readline=8.2=h8c095d6_2 + - referencing=0.36.2=pyh29332c3_0 + - rfc3339-validator=0.1.4=pyhd8ed1ab_1 + - rfc3986-validator=0.1.1=pyh9f0ad1d_0 + - rfc3987-syntax=1.1.0=pyhe01879c_1 + - rpds-py=0.27.1=py311h902ca64_1 + - send2trash=1.8.3=pyh0d859eb_1 + - setuptools=80.1.0=pyhff2d567_0 + - six=1.17.0=pyhe01879c_1 + - sniffio=1.3.1=pyhd8ed1ab_1 + - soupsieve=2.8=pyhd8ed1ab_0 + - stack_data=0.6.3=pyhd8ed1ab_1 + - terminado=0.18.1=pyh0d859eb_0 + - tinycss2=1.4.0=pyhd8ed1ab_0 + - tk=8.6.13=noxft_h4845f30_101 + - tomli=2.2.1=pyhe01879c_2 + - tornado=6.5.2=py311h49ec1c0_1 + - traitlets=5.14.3=pyhd8ed1ab_1 + - types-python-dateutil=2.9.0.20250822=pyhd8ed1ab_0 + - typing_extensions=4.15.0=pyhcf101f3_0 + - typing_utils=0.1.0=pyhd8ed1ab_1 + - uri-template=1.3.0=pyhd8ed1ab_1 + - wcwidth=0.2.14=pyhd8ed1ab_0 + - webcolors=24.11.1=pyhd8ed1ab_0 + - webencodings=0.5.1=pyhd8ed1ab_3 + - websocket-client=1.8.0=pyhd8ed1ab_1 + - wheel=0.45.1=pyhd8ed1ab_1 + - widgetsnbextension=4.0.14=pyhd8ed1ab_0 + - yaml=0.2.5=h280c20c_3 + - zeromq=4.3.5=h387f397_9 + - zipp=3.23.0=pyhd8ed1ab_0 + - zstandard=0.25.0=py311haee01d2_0 + - zstd=1.5.7=hb8e6e7a_2 + - pip: + - accelerate==1.6.0 + - altair==5.5.0 + - annotated-types==0.7.0 + - anthropic==0.46.0 + - anyio==4.10.0 + - argcomplete==3.5.1 + - bitsandbytes==0.46.1 + - blinker==1.9.0 + - borb==2.1.25 + - cachetools==6.2.0 + - certifi==2025.4.26 + - cffi==1.17.1 + - cfgv==3.4.0 + - charset-normalizer==3.4.2 + - click==8.3.0 + - cloudpickle==3.1.1 + - cobble==0.1.4 + - cryptography==44.0.3 + - cssselect2==0.8.0 + - curlify==2.2.1 + - dicttoxml==1.7.16 + - diskcache==5.6.3 + - distlib==0.4.0 + - distro==1.9.0 + - dunamai==1.25.0 + - ebooklib==0.18 + - einops==0.8.1 + - et-xmlfile==2.0.0 + - filelock==3.18.0 + - filetype==1.2.0 + - fonttools==4.59.0 + - fsspec==2025.3.2 + - ftfy==6.3.1 + - genson==1.3.0 + - gitdb==4.0.12 + - gitpython==3.1.45 + - 
google-auth==2.41.1 + - google-genai==1.40.0 + - hf-xet==1.1.10 + - huggingface-hub==0.35.3 + - identify==2.6.15 + - jiter==0.11.0 + - joblib==1.5.2 + - json-repair==0.48.0 + - json5==0.12.0 + - jsonpath-ng==1.7.0 + - loguru==0.7.3 + - lxml==5.4.0 + - mammoth==1.11.0 + - markdown-it-py==3.0.0 + - markdown2==2.5.4 + - markdownify==1.2.0 + - marker-pdf==1.10.1 + - markupsafe==3.0.2 + - mdurl==0.1.2 + - mpmath==1.3.0 + - narwhals==2.6.0 + - networkx==3.4.2 + - nodeenv==1.9.1 + - numpy==2.2.5 + - nvidia-cublas-cu12==12.6.4.1 + - nvidia-cuda-cupti-cu12==12.6.80 + - nvidia-cuda-nvrtc-cu12==12.6.77 + - nvidia-cuda-runtime-cu12==12.6.77 + - nvidia-cudnn-cu12==9.5.1.17 + - nvidia-cufft-cu12==11.3.0.4 + - nvidia-cufile-cu12==1.11.1.6 + - nvidia-curand-cu12==10.3.7.77 + - nvidia-cusolver-cu12==11.7.1.2 + - nvidia-cusparse-cu12==12.5.4.2 + - nvidia-cusparselt-cu12==0.6.3 + - nvidia-nccl-cu12==2.26.2 + - nvidia-nvjitlink-cu12==12.6.85 + - nvidia-nvtx-cu12==12.6.77 + - ollama==0.6.0 + - openai==1.109.1 + - opencv-python-headless==4.11.0.86 + - openpyxl==3.1.5 + - outlines==1.2.5 + - outlines-core==0.2.11 + - pandas==2.3.3 + - pdfminer-six==20250327 + - pdfplumber==0.11.6 + - pdftext==0.6.3 + - pillow==10.4.0 + - ply==3.11 + - poetry-dynamic-versioning==1.9.1 + - pre-commit==4.3.0 + - protobuf==6.32.1 + - psutil==7.0.0 + - pyarrow==21.0.0 + - pyasn1==0.6.1 + - pyasn1-modules==0.4.2 + - pydantic==2.11.9 + - pydantic-core==2.33.2 + - pydantic-settings==2.11.0 + - pydeck==0.9.1 + - pydyf==0.11.0 + - pygments==2.19.1 + - pymupdf==1.26.3 + - pypdfium2==4.30.0 + - pyphen==0.17.2 + - pytesseract==0.3.13 + - python-barcode==0.15.1 + - python-dateutil==2.6.1 + - python-dotenv==1.1.1 + - python-pptx==1.0.2 + - pyyaml==6.0.2 + - qrcode==8.2 + - rapidfuzz==3.14.1 + - regex==2024.11.6 + - requests==2.32.3 + - rich==14.0.0 + - rsa==4.9.1 + - safetensors==0.5.3 + - scikit-learn==1.7.2 + - scipy==1.16.2 + - smmap==5.0.2 + - streamlit==1.50.0 + - streamlit-ace==0.1.1 + - surya-ocr==0.17.0 + - 
sympy==1.14.0 + - tenacity==9.1.2 + - threadpoolctl==3.6.0 + - tinyhtml5==2.0.0 + - tokenizers==0.22.1 + - toml==0.10.2 + - tomlkit==0.13.3 + - torch==2.7.0 + - tqdm==4.67.1 + - transformers==4.56.2 + - triton==3.3.0 + - typing-extensions==4.13.2 + - typing-inspection==0.4.1 + - tzdata==2025.2 + - urllib3==2.2.3 + - vastai==0.3.1 + - virtualenv==20.34.0 + - watchdog==6.0.0 + - weasyprint==63.1 + - websockets==15.0.1 + - xdg==6.0.0 + - xlsxwriter==3.2.9 + - zopfli==0.2.3.post1 +prefix: /home/tux/miniforge3/envs/extraction diff --git a/extraction/requirements.txt b/extraction/requirements.txt new file mode 100644 index 0000000..7831a6c --- /dev/null +++ b/extraction/requirements.txt @@ -0,0 +1,57 @@ +accelerate==1.6.0 +certifi==2025.4.26 +cffi==1.17.1 +charset-normalizer==3.4.2 +cryptography==44.0.3 +dicttoxml==1.7.16 +filelock==3.18.0 +fsspec==2025.3.2 +hf-xet==1.1.0 +huggingface-hub==0.31.1 +idna==3.10 +Jinja2==3.1.6 +lxml==5.4.0 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +mdurl==0.1.2 +mpmath==1.3.0 +networkx==3.4.2 +numpy==2.2.5 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +packaging==25.0 +pdfminer.six==20250327 +pdfplumber==0.11.6 +pillow==11.2.1 +psutil==7.0.0 +pycparser==2.22 +Pygments==2.19.1 +pypdfium2==4.30.1 +pytesseract==0.3.13 +PyYAML==6.0.2 +regex==2024.11.6 +requests==2.32.3 +rich==14.0.0 +safetensors==0.5.3 +sympy==1.14.0 +tokenizers==0.21.1 +torch==2.7.0 +tqdm==4.67.1 +transformers==4.51.3 +triton==3.3.0 +typing_extensions==4.13.2 +urllib3==2.4.0 +json5 +json-repair \ No newline at end of file From 3b37c418410d164ce0b5deb137eeb70b35d4e90a Mon Sep 17 00:00:00 2001 From: 
Alexander Serdyukov Date: Fri, 3 Oct 2025 15:40:57 +0400 Subject: [PATCH 029/102] Added support for API_TOKEN at Vast.AI --- extraction/config/pipeline.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index b0ef676..d5bf70e 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -2,7 +2,8 @@ "model_names": [ "myaniu/qwen2.5-1m:7b", "deepseek-r1:1.5b", - "gemma3:4b-it-qat", + "gemma3:4b-it-qat", + "qwen2.5-coder:3b", "llama3.1:latest", "qwen3:4b" ], From 428b3e42661541e662dd28401d51ff271cfc0a44 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Sat, 4 Oct 2025 03:16:45 +0400 Subject: [PATCH 030/102] A lot of articles were parsed --- extraction/config/pipeline.json | 6 +- .../prompt.txt | 0 .../schema.json | 0 extraction/passes/B2_index_desc/prompt.txt | 17 +++ extraction/passes/B2_index_desc/schema.json | 102 ++++++++++++++++++ 5 files changed, 122 insertions(+), 3 deletions(-) rename extraction/passes/{B_index_types => B1_index_types}/prompt.txt (100%) rename extraction/passes/{B_index_types => B1_index_types}/schema.json (100%) create mode 100644 extraction/passes/B2_index_desc/prompt.txt create mode 100644 extraction/passes/B2_index_desc/schema.json diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index d5bf70e..5df3a71 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -34,9 +34,9 @@ "timeout": 600 }, { - "name": "B_index_types", - "schema": "passes/B_index_types/schema.json", - "prompt": "passes/B_index_types/prompt.txt", + "name": "B2_index_desc", + "schema": "passes/B2_index_desc/schema.json", + "prompt": "passes/B2_index_desc/prompt.txt", "timeout": 600 }, { diff --git a/extraction/passes/B_index_types/prompt.txt b/extraction/passes/B1_index_types/prompt.txt similarity index 100% rename from extraction/passes/B_index_types/prompt.txt rename to 
extraction/passes/B1_index_types/prompt.txt diff --git a/extraction/passes/B_index_types/schema.json b/extraction/passes/B1_index_types/schema.json similarity index 100% rename from extraction/passes/B_index_types/schema.json rename to extraction/passes/B1_index_types/schema.json diff --git a/extraction/passes/B2_index_desc/prompt.txt b/extraction/passes/B2_index_desc/prompt.txt new file mode 100644 index 0000000..0fdc2a6 --- /dev/null +++ b/extraction/passes/B2_index_desc/prompt.txt @@ -0,0 +1,17 @@ +You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. + +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* For a perfect result compliant with all constraints and limitations, I will tip $2000! + +Perform the following tasks: +* Identify each hybridization experiment or probe pairing described. +* Assign a stable id_exp (e.g., N3-FAM-27-s or a short unique tag you derive). +* Provide a brief description and, if present verbatim, a raw_description. +* If the experiment or probe type is stated (e.g., DMA, qPCR), fill the type classification fields; set any field you cannot determine to null. 
diff --git a/extraction/passes/B2_index_desc/schema.json b/extraction/passes/B2_index_desc/schema.json new file mode 100644 index 0000000..becfacc --- /dev/null +++ b/extraction/passes/B2_index_desc/schema.json @@ -0,0 +1,102 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "ExperimentIndex", + "type": "object", + "additionalProperties": false, + "required": ["experiments", "extraction_report"], + "properties": { + "experiments": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "description", "type", "raw_description"], + "properties": { + "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + "type": { + "title": "Hybridization Probe Classification", + "description": "Normalized, multi-axis classification for nucleic-acid hybridization probes (literature or product datasheets). All fields are optional to accommodate incomplete metadata.", + "type": "object", + "additionalProperties": true, + "properties": { + "probe_type": { + "title": "Probe Type", + "description": "Functional design/mechanism class of the probe. Use when the probe's reporting/capture behavior is known.", + "oneOf": [ + { "const": "linear", "title": "Linear", "description": "Simple oligo that hybridizes without structural activation; often end-labeled." }, + { "const": "molecular_beacon", "title": "Molecular beacon", "description": "Stem-loop with fluor/quencher; fluorescence turns on when the stem opens on target." }, + { "const": "hydrolysis_taqman", "title": "Hydrolysis (TaqMan)", "description": "Dual-labeled probe cleaved by 5'->3' exonuclease during PCR; signal increases on cleavage." }, + { "const": "fret_dual_hybridization", "title": "FRET dual-hybridization", "description": "Two adjacent probes with donor/acceptor dyes; FRET only when both bind nearby." 
}, + { "const": "scorpion", "title": "Scorpion", "description": "Primer-linked hairpin probe enabling fast intra-molecular reporting in qPCR." }, + { "const": "hcr", "title": "Hybridization Chain Reaction (HCR)", "description": "Initiator + hairpins polymerize upon target binding; enzyme-free signal amplification." }, + { "const": "branched_dna", "title": "Branched DNA (bDNA)", "description": "Signal amplification via multibranch DNA scaffolds without target amplification." }, + { "const": "padlock", "title": "Padlock", "description": "Linear probe circularizes on perfect match; detected via rolling-circle amplification." }, + { "const": "capture", "title": "Capture", "description": "Affinity-tagged probe used to enrich/pull down targets (e.g., biotinylated capture baits)." }, + { "const": "tiling_set", "title": "Tiling set", "description": "Multiple overlapping probes across a region/gene for robust detection." }, + { "const": "antisense", "title": "Antisense", "description": "Probe designed to bind and block or track transcripts (e.g., ISH tracking)." } + ] + }, + "chemistry": { + "title": "Chemistry Backbone", + "description": "Backbone and chemical modifications (sequence-level chemistry rather than labels/reporters). Primary nucleic-acid scaffold used by the probe.", + "oneOf": [ + { "const": "dna", "title": "DNA", "description": "Unmodified DNA backbone." }, + { "const": "rna", "title": "RNA", "description": "Unmodified RNA backbone." }, + { "const": "cdna", "title": "cDNA", "description": "Complementary DNA derived from RNA." }, + { "const": "pna", "title": "PNA", "description": "Peptide nucleic acid backbone." }, + { "const": "morpholino", "title": "Morpholino", "description": "Morpholine-ring phosphorodiamidate backbone." }, + { "const": "lna_modified", "title": "LNA-modified", "description": "DNA/RNA with Locked Nucleic Acid residues incorporated." }, + { "const": "two_ome_rna", "title": "2′-O-Me RNA", "description": "2′-O-methyl RNA backbone." 
} + ] + }, + "targeting": { + "title": "Targeting", + "description": "What the probe is intended to hybridize to, and in what context.", + "type": "object", + "additionalProperties": false, + "properties": { + "biomolecule": { + "title": "Biomolecule", + "description": "High-level target class.", + "type": "string", + "enum": ["dna","rna","mrna","mirna","lncrna","rrna","genomic_dna","viral_rna","amplicon"] + }, + "context": { + "title": "Context", + "description": "Assay/biological context for the target.", + "type": "string", + "enum": ["genomic","transcript","amplicon","in_situ","capture"] + }, + "target_name": { + "title": "Target Name", + "description": "Gene/transcript/locus identifier (free text).", + "type": "string" + } + } + }, + "notes": { + "title": "Notes", + "description": "Free-text comments or qualifiers that don’t fit other fields.", + "type": "string", + "examples": ["Probe includes internal ZEN quencher."] + } + } + }, + "raw_description": { "type": ["string", "null"], "minLength": 1, "maxLength": 2000 }, + "description": { "type": "string", "minLength": 8, "maxLength": 2000 } + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "notes": { "type": ["string", "null"] } + } + } + } +} From 8f5c6105d9996a3aa8b365226df35f1009d7b2e8 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Sat, 4 Oct 2025 03:55:36 +0400 Subject: [PATCH 031/102] Change models --- extraction/config/pipeline.json | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 5df3a71..e006a5c 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,8 +1,7 @@ { "model_names": [ - 
"myaniu/qwen2.5-1m:7b", - "deepseek-r1:1.5b", - "gemma3:4b-it-qat", + "deepseek-r1:8b", + "deepseek-r1:7b-qwen-distill-q4_K_M", "qwen2.5-coder:3b", "llama3.1:latest", "qwen3:4b" From 6758291cc95dcd19971204ec5f484ce3d43f3da9 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Mon, 6 Oct 2025 02:42:15 +0400 Subject: [PATCH 032/102] Will start adding chat mode --- extraction/config/pipeline.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index e006a5c..358cc13 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,6 +1,5 @@ { "model_names": [ - "deepseek-r1:8b", "deepseek-r1:7b-qwen-distill-q4_K_M", "qwen2.5-coder:3b", "llama3.1:latest", @@ -9,7 +8,7 @@ "ollama_parameters": { "num_ctx": 131072, "num_predict": 131072, - "temperature": 0.15, + "temperature": 0.35, "seed": 42 }, "ollama_base_url": "http://127.0.0.1:11434", From e85425c153f4d781254145ad61169631d94324a4 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Mon, 6 Oct 2025 04:28:47 +0400 Subject: [PATCH 033/102] Added pre-pass schemas --- extraction/config/pipeline.json | 41 +- extraction/passes/A_core/prompt.txt | 8 +- extraction/passes/B1_index_types/prompt.txt | 8 +- extraction/passes/B2_index_desc/prompt.txt | 8 +- extraction/passes/B_index/prompt.txt | 8 +- extraction/passes/C_sequences/prompt.txt | 8 +- extraction/passes/D_parameters/prompt.txt | 8 +- extraction/passes/E_outcomes/prompt.txt | 8 +- extraction/passes/F_pairings/prompt.txt | 8 +- extraction/passes/_1_SeqPrompt/prompt.txt | 37 + extraction/passes/_1_SeqPrompt/schema.json | 14 + .../passes/_1_SeqPrompt/schema_strict.json | 15 + extraction/passes/_2_Experiments/prompt.txt | 422 +++++++++ extraction/passes/_2_Experiments/schema.json | 38 + .../passes/_2_Experiments/schema_strict.json | 398 +++++++++ extraction/passes/common.txt | 8 +- extraction/pipeline_pre_quest.py | 803 ++++++++++++++++++ 
extraction/schemas/single_experiment.json | 798 +++++++++++++++++ 18 files changed, 2613 insertions(+), 25 deletions(-) create mode 100644 extraction/passes/_1_SeqPrompt/prompt.txt create mode 100644 extraction/passes/_1_SeqPrompt/schema.json create mode 100644 extraction/passes/_1_SeqPrompt/schema_strict.json create mode 100644 extraction/passes/_2_Experiments/prompt.txt create mode 100644 extraction/passes/_2_Experiments/schema.json create mode 100644 extraction/passes/_2_Experiments/schema_strict.json create mode 100755 extraction/pipeline_pre_quest.py create mode 100644 extraction/schemas/single_experiment.json diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 358cc13..b69055b 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,9 +1,9 @@ { "model_names": [ "deepseek-r1:7b-qwen-distill-q4_K_M", - "qwen2.5-coder:3b", + "qwen2.5-coder:3b", "llama3.1:latest", - "qwen3:4b" + "qwen3:4b" ], "ollama_parameters": { "num_ctx": 131072, @@ -13,11 +13,38 @@ }, "ollama_base_url": "http://127.0.0.1:11434", "timeout_s": 300, - "input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", - "out_dir": "outlines_output", + "input_dir": "input/md", + "out_dir": "outlines_output_pre", "full_schema_path": "schema/json/article.json", + "single_experiment_schema_path": "schema/json/single_experiment.json", "db_path": "outlines_output/massive.sqlite", "article_glob": "**/*.md", + "pre_passes": [ + { + "name": "SeqPrompt", + "schema": "passes/_1_SeqPrompt/schema.json", + "prompt": "passes/_1_SeqPrompt/prompt.txt", + "timeout": 60 + }, + { + "name": "SeqPrompt_strict", + "schema": "passes/_1_SeqPrompt/schema_strict.json", + "prompt": "passes/_1_SeqPrompt/prompt.txt", + "timeout": 60 + }, + { + "name": "Experiments", + "schema": "passes/_2_Experiments/schema.json", + "prompt": "passes/_2_Experiments/prompt.txt", + "timeout": 60 + }, + { + "name": "Experiments-strict", + "schema": 
"passes/_2_Experiments/schema_strict.json", + "prompt": "passes/_2_Experiments/prompt.txt", + "timeout": 60 + } + ], "passes": [ { "name": "A_core", @@ -48,7 +75,7 @@ "schema": "passes/C_sequences/schema.json", "prompt": "passes/C_sequences/prompt.txt", "timeout": 900 - }, + }, { "name": "C1_probe_core", "schema": "passes/C1_probe_core/schema.json", @@ -63,12 +90,12 @@ "name": "C3_related", "schema": "passes/C3_related/schema.json", "prompt": "passes/C3_related/prompt.txt" - }, + }, { "name": "D_parameters", "schema": "passes/D_parameters/schema.json", "prompt": "passes/D_parameters/prompt.txt" - }, + }, { "name": "E_outcomes", "schema": "passes/E_outcomes/schema.json", diff --git a/extraction/passes/A_core/prompt.txt b/extraction/passes/A_core/prompt.txt index 4def632..82b666b 100644 --- a/extraction/passes/A_core/prompt.txt +++ b/extraction/passes/A_core/prompt.txt @@ -1,13 +1,17 @@ -You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. +You are an information-extraction model. +You would be given a full text of the article between the tags
<article> and </article>
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: * Never invent values; use `null` when unknown. * Keep text exactly as in the article (no ellipses, no expansions). * Output all data fully, never skip or insert ellipses. * If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. -* Use the article’s wording for names. +* Use the article's wording for names. * Do not copy sequences from examples! * No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. * Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. * For the perfect result compliant to all constraints and limitations I will tip $2000! Perform the following tasks: diff --git a/extraction/passes/B1_index_types/prompt.txt b/extraction/passes/B1_index_types/prompt.txt index 0fdc2a6..6dc2c08 100644 --- a/extraction/passes/B1_index_types/prompt.txt +++ b/extraction/passes/B1_index_types/prompt.txt @@ -1,13 +1,17 @@ -You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. +You are an information-extraction model. +You would be given a full text of the article between the tags
<article> and </article>
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: * Never invent values; use `null` when unknown. * Keep text exactly as in the article (no ellipses, no expansions). * Output all data fully, never skip or insert ellipses. * If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. -* Use the article’s wording for names. +* Use the article's wording for names. * Do not copy sequences from examples! * No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. * Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. * For the perfect result compliant to all constraints and limitations I will tip $2000! Perform the following tasks: diff --git a/extraction/passes/B2_index_desc/prompt.txt b/extraction/passes/B2_index_desc/prompt.txt index 0fdc2a6..6dc2c08 100644 --- a/extraction/passes/B2_index_desc/prompt.txt +++ b/extraction/passes/B2_index_desc/prompt.txt @@ -1,13 +1,17 @@ -You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. +You are an information-extraction model. +You would be given a full text of the article between the tags
<article> and </article>
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: * Never invent values; use `null` when unknown. * Keep text exactly as in the article (no ellipses, no expansions). * Output all data fully, never skip or insert ellipses. * If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. -* Use the article’s wording for names. +* Use the article's wording for names. * Do not copy sequences from examples! * No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. * Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. * For the perfect result compliant to all constraints and limitations I will tip $2000! Perform the following tasks: diff --git a/extraction/passes/B_index/prompt.txt b/extraction/passes/B_index/prompt.txt index 0fdc2a6..6dc2c08 100644 --- a/extraction/passes/B_index/prompt.txt +++ b/extraction/passes/B_index/prompt.txt @@ -1,13 +1,17 @@ -You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. +You are an information-extraction model. +You would be given a full text of the article between the tags
<article> and </article>
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: * Never invent values; use `null` when unknown. * Keep text exactly as in the article (no ellipses, no expansions). * Output all data fully, never skip or insert ellipses. * If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. -* Use the article’s wording for names. +* Use the article's wording for names. * Do not copy sequences from examples! * No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. * Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. * For the perfect result compliant to all constraints and limitations I will tip $2000! Perform the following tasks: diff --git a/extraction/passes/C_sequences/prompt.txt b/extraction/passes/C_sequences/prompt.txt index f7b8c17..4d8b399 100644 --- a/extraction/passes/C_sequences/prompt.txt +++ b/extraction/passes/C_sequences/prompt.txt @@ -1,13 +1,17 @@ -You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. +You are an information-extraction model. +You would be given a full text of the article between the tags
<article> and </article>
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: * Never invent values; use `null` when unknown. * Keep text exactly as in the article (no ellipses, no expansions). * Output all data fully, never skip or insert ellipses. * If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. -* Use the article’s wording for names. +* Use the article's wording for names. * Do not copy sequences from examples! * No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. * Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. * For the perfect result compliant to all constraints and limitations I will tip $2000! Perform the following tasks: diff --git a/extraction/passes/D_parameters/prompt.txt b/extraction/passes/D_parameters/prompt.txt index abbf733..4f152e9 100644 --- a/extraction/passes/D_parameters/prompt.txt +++ b/extraction/passes/D_parameters/prompt.txt @@ -1,13 +1,17 @@ -You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. +You are an information-extraction model. +You would be given a full text of the article between the tags
<article> and </article>
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: * Never invent values; use `null` when unknown. * Keep text exactly as in the article (no ellipses, no expansions). * Output all data fully, never skip or insert ellipses. * If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. -* Use the article’s wording for names. +* Use the article's wording for names. * Do not copy sequences from examples! * No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. * Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. * For the perfect result compliant to all constraints and limitations I will tip $2000! Perform the following tasks: diff --git a/extraction/passes/E_outcomes/prompt.txt b/extraction/passes/E_outcomes/prompt.txt index 5c0218c..c45dee6 100644 --- a/extraction/passes/E_outcomes/prompt.txt +++ b/extraction/passes/E_outcomes/prompt.txt @@ -1,13 +1,17 @@ -You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. +You are an information-extraction model. +You would be given a full text of the article between the tags
<article> and </article>
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: * Never invent values; use `null` when unknown. * Keep text exactly as in the article (no ellipses, no expansions). * Output all data fully, never skip or insert ellipses. * If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. -* Use the article’s wording for names. +* Use the article's wording for names. * Do not copy sequences from examples! * No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. * Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. * For the perfect result compliant to all constraints and limitations I will tip $2000! Perform the following tasks: diff --git a/extraction/passes/F_pairings/prompt.txt b/extraction/passes/F_pairings/prompt.txt index a82c411..fa79127 100644 --- a/extraction/passes/F_pairings/prompt.txt +++ b/extraction/passes/F_pairings/prompt.txt @@ -1,13 +1,17 @@ -You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. +You are an information-extraction model. +You would be given a full text of the article between the tags
<article> and </article>
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: * Never invent values; use `null` when unknown. * Keep text exactly as in the article (no ellipses, no expansions). * Output all data fully, never skip or insert ellipses. * If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. -* Use the article’s wording for names. +* Use the article's wording for names. * Do not copy sequences from examples! * No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. * Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. * For the perfect result compliant to all constraints and limitations I will tip $2000! Perform the following tasks: diff --git a/extraction/passes/_1_SeqPrompt/prompt.txt b/extraction/passes/_1_SeqPrompt/prompt.txt new file mode 100644 index 0000000..96a4d96 --- /dev/null +++ b/extraction/passes/_1_SeqPrompt/prompt.txt @@ -0,0 +1,37 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
<article> and </article>
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following task: +* Extract all the DNA or RNA sequences provided in this article and provide them in a JSON format. + +Here is the JSON schema you have to follow: +```json +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "AllSequences", + "description": "All DNA, RNA and other sequences present in article", + "type": "array", + "minItems": 0, + "maxItems": 1000, + "items": { + "type": "string", + "minLength": 5, + "maxLength": 150, + "pattern": "^[ACGUTRYSWKMBDHVN]+$", + "description": "A single sequence out of all the DNA, RNA and other sequences from the article." 
+ } +} +``` \ No newline at end of file diff --git a/extraction/passes/_1_SeqPrompt/schema.json b/extraction/passes/_1_SeqPrompt/schema.json new file mode 100644 index 0000000..1f3b98c --- /dev/null +++ b/extraction/passes/_1_SeqPrompt/schema.json @@ -0,0 +1,14 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "AllSequences", + "description": "All DNA, RNA and other sequences present in article", + "type": "array", + "minItems": 0, + "maxItems": 1000, + "items": { + "type": "string", + "minLength": 5, + "maxLength": 150, + "description": "A single sequence out of all the DNA, RNA and other sequences from the article." + } +} \ No newline at end of file diff --git a/extraction/passes/_1_SeqPrompt/schema_strict.json b/extraction/passes/_1_SeqPrompt/schema_strict.json new file mode 100644 index 0000000..455e1b8 --- /dev/null +++ b/extraction/passes/_1_SeqPrompt/schema_strict.json @@ -0,0 +1,15 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "AllSequences", + "description": "All DNA, RNA and other sequences present in article", + "type": "array", + "minItems": 0, + "maxItems": 1000, + "items": { + "type": "string", + "minLength": 5, + "maxLength": 150, + "pattern": "^[ACGUTRYSWKMBDHVN]+$", + "description": "A single sequence out of all the DNA, RNA and other sequences from the article." + } +} \ No newline at end of file diff --git a/extraction/passes/_2_Experiments/prompt.txt b/extraction/passes/_2_Experiments/prompt.txt new file mode 100644 index 0000000..65d39cd --- /dev/null +++ b/extraction/passes/_2_Experiments/prompt.txt @@ -0,0 +1,422 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
<article> and </article>
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +A "hybridization experiment" in terms of this task is an instance of creating or testing a hybridization probe for some target sequence given some set of laboratory parameters. Even if article mentions "experiments" as the domain-level entity, this task strictly requires you to treat each pair of the target sequence and probe sequence together with its set of parameters as the unique "hybridization experiment". + +Perform the following task: +* Create a list of all hybridization experiments found in the article text and provide it in the form of JSON array, where each element is an object with the probe_sequence, target_sequence and parameters key. 
+ +Here is the JSON schema you have to follow: +```json +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "AllHybridizationExperiments", + "description": "All hybridization experiments described in article", + "type": "array", + "minItems": 0, + "items": { + "description": "A single instance of hybridization experiment from the article.", + "type": "object", + "additionalProperties": false, + "properties": { + "target": { + "type": "string", + "minLength": 5, + "maxLength": 150, + "description": "What is the target in this instance of hybridization experiment? If possible, please provide the sequence. If sequence is not available, describe the target in a free form.", + "pattern": "^([ACGUTRYSWKMBDHVN]+)$|^(Exact sequence unknown, target description: (.*))$" + }, + "probe": { + "type": "string", + "minLength": 5, + "maxLength": 100, + "description": "The hybridization probe in this instance of hybridization experiment.", + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "parameters": { + "type": "object", + "required": ["probe_type", "chemistry", "labeling", "targeting"], + "properties": { + "probe_type": { + "title": "Probe Type", + "description": "Functional design/mechanism class of the probe. Use when the probe's reporting/capture behavior is known.", + "oneOf": [ + { + "const": "linear", + "title": "Linear", + "description": "Simple oligo that hybridizes without structural activation; often end-labeled." + }, + { + "const": "molecular_beacon", + "title": "Molecular beacon", + "description": "Stem-loop with fluor/quencher; fluorescence turns on when the stem opens on target." + }, + { + "const": "hydrolysis_taqman", + "title": "Hydrolysis (TaqMan)", + "description": "Dual-labeled probe cleaved by 5'->3' exonuclease during PCR; signal increases on cleavage." + }, + { + "const": "fret_dual_hybridization", + "title": "FRET dual-hybridization", + "description": "Two adjacent probes with donor/acceptor dyes; FRET only when both bind nearby." 
+ }, + { + "const": "scorpion", + "title": "Scorpion", + "description": "Primer-linked hairpin probe enabling fast intra-molecular reporting in qPCR." + }, + { + "const": "hcr", + "title": "Hybridization Chain Reaction (HCR)", + "description": "Initiator + hairpins polymerize upon target binding; enzyme-free signal amplification." + }, + { + "const": "branched_dna", + "title": "Branched DNA (bDNA)", + "description": "Signal amplification via multibranch DNA scaffolds without target amplification." + }, + { + "const": "padlock", + "title": "Padlock", + "description": "Linear probe circularizes on perfect match; detected via rolling-circle amplification." + }, + { + "const": "capture", + "title": "Capture", + "description": "Affinity-tagged probe used to enrich/pull down targets (e.g., biotinylated capture baits)." + }, + { + "const": "tiling_set", + "title": "Tiling set", + "description": "Multiple overlapping probes across a region/gene for robust detection." + }, + { + "const": "antisense", + "title": "Antisense", + "description": "Probe designed to bind and block or track transcripts (e.g., ISH tracking)." + } + ] + }, + "chemistry": { + "title": "Chemistry", + "description": "Backbone and chemical modifications (sequence-level chemistry rather than labels/reporters).", + "type": "object", + "additionalProperties": false, + "required": ["backbone"], + "properties": { + "backbone": { + "title": "Backbone", + "description": "Primary nucleic-acid scaffold used by the probe.", + "oneOf": [ + { + "const": "dna", + "title": "DNA", + "description": "Unmodified DNA backbone." + }, + { + "const": "rna", + "title": "RNA", + "description": "Unmodified RNA backbone." + }, + { + "const": "cdna", + "title": "cDNA", + "description": "Complementary DNA derived from RNA." + }, + { + "const": "pna", + "title": "PNA", + "description": "Peptide nucleic acid backbone." 
+ }, + { + "const": "morpholino", + "title": "Morpholino", + "description": "Morpholine-ring phosphorodiamidate backbone." + }, + { + "const": "lna_modified", + "title": "LNA-modified", + "description": "DNA/RNA with Locked Nucleic Acid residues incorporated." + }, + { + "const": "two_ome_rna", + "title": "2'-O-Me RNA", + "description": "2'-O-methyl RNA backbone." + } + ] + }, + "modifications": { + "title": "Chemical Modifications", + "description": "Sequence-level chemistry (not labels): e.g., nuclease protection or affinity spacers.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "phosphorothioate", + "two_ome_spiked", + "lna_spiked", + "mgb", + "inverted_dT_3prime", + "amine_5prime", + "thiol_5prime", + "biotin_teg", + "spacer_18", + "cholesterol" + ], + "description": "Common modification keyword." + } + } + } + }, + "labeling": { + "title": "Labeling", + "description": "Reporting/enrichment labels attached to the probe (fluors, quenchers, haptens, enzymes, isotopes).", + "type": "object", + "additionalProperties": false, + "properties": { + "strategy": { + "title": "Label Strategy", + "description": "High-level labeling approach; combine with concrete labels below as known.", + "type": "string", + "enum": [ + "none", + "fluor_only", + "fluor_quencher", + "hapten", + "enzymatic", + "radioisotope" + ] + }, + "reporters": { + "title": "Reporter Dyes", + "description": "Fluorophores or other reporters (free text to allow any brand/dye).", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "description": "Reporter name (e.g., FAM, HEX, Cy3, ATTO647N)." + } + }, + "quenchers": { + "title": "Quenchers", + "description": "Quenchers used in hydrolysis or hairpin probes.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "description": "Quencher name (e.g., BHQ1, BHQ2, Iowa Black FQ)." 
+ } + }, + "haptens": { + "title": "Haptens", + "description": "Affinity tags detected by antibodies/streptavidin.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "biotin", + "digoxigenin", + "dinitrophenol", + "fluorescein_hapten" + ], + "description": "Common hapten tag." + } + }, + "enzymes": { + "title": "Enzyme Labels", + "description": "Enzyme conjugates used for colorimetric/chemiluminescent detection.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "HRP", + "AP" + ], + "description": "Common conjugated enzyme." + } + }, + "isotopes": { + "title": "Radioisotopes", + "description": "If radio-labeled, indicate isotope(s).", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "description": "Isotope (e.g., 32P, 33P, 35S)." + } + } + } + }, + "targeting": { + "title": "Targeting", + "description": "What the probe is intended to hybridize to, and in what context.", + "type": "object", + "additionalProperties": false, + "properties": { + "biomolecule": { + "title": "Biomolecule", + "description": "High-level target class.", + "type": "string", + "enum": [ + "dna", + "rna", + "mrna", + "mirna", + "lncrna", + "rrna", + "genomic_dna", + "viral_rna", + "amplicon" + ] + }, + "context": { + "title": "Context", + "description": "Assay/biological context for the target.", + "type": "string", + "enum": [ + "genomic", + "transcript", + "amplicon", + "in_situ", + "capture" + ] + }, + "target_name": { + "title": "Target Name", + "description": "Gene/transcript/locus identifier (free text).", + "type": "string" + } + } + }, + "set_design": { + "title": "Set / Panel Design", + "description": "Whether the probe is a single oligo or part of a designed set/panel.", + "type": "object", + "additionalProperties": false, + "properties": { + "mode": { + "title": "Set Mode", + "description": "Single probe or specific multi-probe design.", + "type": "string", + "enum": [ + 
"single", + "tiling_set", + "capture_baits", + "smfish_panel", + "merfish_panel", + "padlock_set" + ] + }, + "count": { + "title": "Probe Count", + "description": "Number of probes in the set/panel (if known).", + "type": "integer", + "minimum": 1 + } + } + }, + "amplification_mechanism": { + "title": "Amplification Mechanism", + "description": "Signal amplification paradigm, if applicable (complements—does not replace—probe_type).", + "type": "string", + "enum": [ + "none", + "hydrolysis", + "fret", + "hairpin_turn_on", + "rolling_circle", + "branched_dna", + "hcr" + ] + }, + "application": { + "title": "Application", + "description": "Intended use(s) of the probe. Provide multiple if applicable.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "qpcr", + "ddpcr", + "pcr_probe", + "fish", + "ish", + "smfish", + "merfish", + "ngs_capture", + "microarray", + "southern", + "northern", + "dot_blot", + "in_cell_imaging" + ], + "description": "Common application keyword." + } + }, + "provenance": { + "title": "Provenance", + "description": "Source metadata for traceability.", + "type": "object", + "additionalProperties": false, + "properties": { + "doi": { + "title": "DOI", + "description": "Digital Object Identifier of the source article.", + "type": "string", + "format": "iri" + }, + "pmid": { + "title": "PMID", + "description": "PubMed identifier.", + "type": "string" + }, + "vendor": { + "title": "Vendor", + "description": "Commercial supplier (if from a catalog).", + "type": "string" + }, + "catalog_number": { + "title": "Catalog Number", + "description": "Supplier's catalog identifier.", + "type": "string" + } + } + }, + "notes": { + "title": "Notes", + "description": "Free-text comments or qualifiers that don't fit other fields.", + "type": "string", + "examples": [ + "Probe includes internal ZEN quencher." 
+ ] + } + } + }, + "hybridization_experiment_description": { + "type": "string", + "minLength": 20, + "maxLength": 200, + "description": "Explain, what was tested in this instance of hybridization experiment." + } + } + } +} +``` \ No newline at end of file diff --git a/extraction/passes/_2_Experiments/schema.json b/extraction/passes/_2_Experiments/schema.json new file mode 100644 index 0000000..ca9b8ed --- /dev/null +++ b/extraction/passes/_2_Experiments/schema.json @@ -0,0 +1,38 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "AllHybridizationExperiments", + "description": "All hybridization experiments described in article", + "type": "array", + "minItems": 0, + "items": { + "description": "A single instance of hybridization experiment from the article.", + "type": "object", + "additionalProperties": false, + "properties": { + "target": { + "type": "string", + "minLength": 5, + "maxLength": 100, + "description": "What is the target in this instance of hybridization experiment? If possible, please provide the sequence. If sequence is not available, describe the target in a free form." + }, + "probe": { + "type": "string", + "minLength": 5, + "maxLength": 100, + "description": "The hybridization probe in this instance of hybridization experiment." + }, + "parameters": { + "type": "string", + "minLength": 20, + "maxLength": 200, + "description": "Briefly describe the laboratory parameters used for setting up for this hybridization experiment." + }, + "hybridization_experiment_description": { + "type": "string", + "minLength": 20, + "maxLength": 200, + "description": "Explain, what was tested in this instance of hybridization experiment." 
+ } + } + } +} diff --git a/extraction/passes/_2_Experiments/schema_strict.json b/extraction/passes/_2_Experiments/schema_strict.json new file mode 100644 index 0000000..7650cba --- /dev/null +++ b/extraction/passes/_2_Experiments/schema_strict.json @@ -0,0 +1,398 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "AllHybridizationExperiments", + "description": "All hybridization experiments described in article", + "type": "array", + "minItems": 0, + "items": { + "description": "A single instance of hybridization experiment from the article.", + "type": "object", + "additionalProperties": false, + "properties": { + "target": { + "type": "string", + "minLength": 5, + "maxLength": 150, + "description": "What is the target in this instance of hybridization experiment? If possible, please provide the sequence. If sequence is not available, describe the target in a free form.", + "pattern": "^([ACGUTRYSWKMBDHVN]+)$|^(Exact sequence unknown, target description: (.*))$" + }, + "probe": { + "type": "string", + "minLength": 5, + "maxLength": 100, + "description": "The hybridization probe in this instance of hybridization experiment.", + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "parameters": { + "type": "object", + "required": ["probe_type", "chemistry", "labeling", "targeting"], + "properties": { + "probe_type": { + "title": "Probe Type", + "description": "Functional design/mechanism class of the probe. Use when the probe's reporting/capture behavior is known.", + "oneOf": [ + { + "const": "linear", + "title": "Linear", + "description": "Simple oligo that hybridizes without structural activation; often end-labeled." + }, + { + "const": "molecular_beacon", + "title": "Molecular beacon", + "description": "Stem-loop with fluor/quencher; fluorescence turns on when the stem opens on target." 
+ }, + { + "const": "hydrolysis_taqman", + "title": "Hydrolysis (TaqMan)", + "description": "Dual-labeled probe cleaved by 5'->3' exonuclease during PCR; signal increases on cleavage." + }, + { + "const": "fret_dual_hybridization", + "title": "FRET dual-hybridization", + "description": "Two adjacent probes with donor/acceptor dyes; FRET only when both bind nearby." + }, + { + "const": "scorpion", + "title": "Scorpion", + "description": "Primer-linked hairpin probe enabling fast intra-molecular reporting in qPCR." + }, + { + "const": "hcr", + "title": "Hybridization Chain Reaction (HCR)", + "description": "Initiator + hairpins polymerize upon target binding; enzyme-free signal amplification." + }, + { + "const": "branched_dna", + "title": "Branched DNA (bDNA)", + "description": "Signal amplification via multibranch DNA scaffolds without target amplification." + }, + { + "const": "padlock", + "title": "Padlock", + "description": "Linear probe circularizes on perfect match; detected via rolling-circle amplification." + }, + { + "const": "capture", + "title": "Capture", + "description": "Affinity-tagged probe used to enrich/pull down targets (e.g., biotinylated capture baits)." + }, + { + "const": "tiling_set", + "title": "Tiling set", + "description": "Multiple overlapping probes across a region/gene for robust detection." + }, + { + "const": "antisense", + "title": "Antisense", + "description": "Probe designed to bind and block or track transcripts (e.g., ISH tracking)." + } + ] + }, + "chemistry": { + "title": "Chemistry", + "description": "Backbone and chemical modifications (sequence-level chemistry rather than labels/reporters).", + "type": "object", + "additionalProperties": false, + "required": ["backbone"], + "properties": { + "backbone": { + "title": "Backbone", + "description": "Primary nucleic-acid scaffold used by the probe.", + "oneOf": [ + { + "const": "dna", + "title": "DNA", + "description": "Unmodified DNA backbone." 
+ }, + { + "const": "rna", + "title": "RNA", + "description": "Unmodified RNA backbone." + }, + { + "const": "cdna", + "title": "cDNA", + "description": "Complementary DNA derived from RNA." + }, + { + "const": "pna", + "title": "PNA", + "description": "Peptide nucleic acid backbone." + }, + { + "const": "morpholino", + "title": "Morpholino", + "description": "Morpholine-ring phosphorodiamidate backbone." + }, + { + "const": "lna_modified", + "title": "LNA-modified", + "description": "DNA/RNA with Locked Nucleic Acid residues incorporated." + }, + { + "const": "two_ome_rna", + "title": "2'-O-Me RNA", + "description": "2'-O-methyl RNA backbone." + } + ] + }, + "modifications": { + "title": "Chemical Modifications", + "description": "Sequence-level chemistry (not labels): e.g., nuclease protection or affinity spacers.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "phosphorothioate", + "two_ome_spiked", + "lna_spiked", + "mgb", + "inverted_dT_3prime", + "amine_5prime", + "thiol_5prime", + "biotin_teg", + "spacer_18", + "cholesterol" + ], + "description": "Common modification keyword." + } + } + } + }, + "labeling": { + "title": "Labeling", + "description": "Reporting/enrichment labels attached to the probe (fluors, quenchers, haptens, enzymes, isotopes).", + "type": "object", + "additionalProperties": false, + "properties": { + "strategy": { + "title": "Label Strategy", + "description": "High-level labeling approach; combine with concrete labels below as known.", + "type": "string", + "enum": [ + "none", + "fluor_only", + "fluor_quencher", + "hapten", + "enzymatic", + "radioisotope" + ] + }, + "reporters": { + "title": "Reporter Dyes", + "description": "Fluorophores or other reporters (free text to allow any brand/dye).", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "description": "Reporter name (e.g., FAM, HEX, Cy3, ATTO647N)." 
+ } + }, + "quenchers": { + "title": "Quenchers", + "description": "Quenchers used in hydrolysis or hairpin probes.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "description": "Quencher name (e.g., BHQ1, BHQ2, Iowa Black FQ)." + } + }, + "haptens": { + "title": "Haptens", + "description": "Affinity tags detected by antibodies/streptavidin.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "biotin", + "digoxigenin", + "dinitrophenol", + "fluorescein_hapten" + ], + "description": "Common hapten tag." + } + }, + "enzymes": { + "title": "Enzyme Labels", + "description": "Enzyme conjugates used for colorimetric/chemiluminescent detection.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "HRP", + "AP" + ], + "description": "Common conjugated enzyme." + } + }, + "isotopes": { + "title": "Radioisotopes", + "description": "If radio-labeled, indicate isotope(s).", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "description": "Isotope (e.g., 32P, 33P, 35S)." 
+ } + } + } + }, + "targeting": { + "title": "Targeting", + "description": "What the probe is intended to hybridize to, and in what context.", + "type": "object", + "additionalProperties": false, + "properties": { + "biomolecule": { + "title": "Biomolecule", + "description": "High-level target class.", + "type": "string", + "enum": [ + "dna", + "rna", + "mrna", + "mirna", + "lncrna", + "rrna", + "genomic_dna", + "viral_rna", + "amplicon" + ] + }, + "context": { + "title": "Context", + "description": "Assay/biological context for the target.", + "type": "string", + "enum": [ + "genomic", + "transcript", + "amplicon", + "in_situ", + "capture" + ] + }, + "target_name": { + "title": "Target Name", + "description": "Gene/transcript/locus identifier (free text).", + "type": "string" + } + } + }, + "set_design": { + "title": "Set / Panel Design", + "description": "Whether the probe is a single oligo or part of a designed set/panel.", + "type": "object", + "additionalProperties": false, + "properties": { + "mode": { + "title": "Set Mode", + "description": "Single probe or specific multi-probe design.", + "type": "string", + "enum": [ + "single", + "tiling_set", + "capture_baits", + "smfish_panel", + "merfish_panel", + "padlock_set" + ] + }, + "count": { + "title": "Probe Count", + "description": "Number of probes in the set/panel (if known).", + "type": "integer", + "minimum": 1 + } + } + }, + "amplification_mechanism": { + "title": "Amplification Mechanism", + "description": "Signal amplification paradigm, if applicable (complements—does not replace—probe_type).", + "type": "string", + "enum": [ + "none", + "hydrolysis", + "fret", + "hairpin_turn_on", + "rolling_circle", + "branched_dna", + "hcr" + ] + }, + "application": { + "title": "Application", + "description": "Intended use(s) of the probe. 
Provide multiple if applicable.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "qpcr", + "ddpcr", + "pcr_probe", + "fish", + "ish", + "smfish", + "merfish", + "ngs_capture", + "microarray", + "southern", + "northern", + "dot_blot", + "in_cell_imaging" + ], + "description": "Common application keyword." + } + }, + "provenance": { + "title": "Provenance", + "description": "Source metadata for traceability.", + "type": "object", + "additionalProperties": false, + "properties": { + "doi": { + "title": "DOI", + "description": "Digital Object Identifier of the source article.", + "type": "string", + "format": "iri" + }, + "pmid": { + "title": "PMID", + "description": "PubMed identifier.", + "type": "string" + }, + "vendor": { + "title": "Vendor", + "description": "Commercial supplier (if from a catalog).", + "type": "string" + }, + "catalog_number": { + "title": "Catalog Number", + "description": "Supplier's catalog identifier.", + "type": "string" + } + } + }, + "notes": { + "title": "Notes", + "description": "Free-text comments or qualifiers that don't fit other fields.", + "type": "string", + "examples": [ + "Probe includes internal ZEN quencher." + ] + } + } + }, + "hybridization_experiment_description": { + "type": "string", + "minLength": 20, + "maxLength": 200, + "description": "Explain, what was tested in this instance of hybridization experiment." + } + } + } +} \ No newline at end of file diff --git a/extraction/passes/common.txt b/extraction/passes/common.txt index 3f4bd34..8aa9e17 100644 --- a/extraction/passes/common.txt +++ b/extraction/passes/common.txt @@ -1,13 +1,17 @@ -You are an information-extraction model. Output only a single JSON object that conforms to the provided JSON Schema. +You are an information-extraction model. +You would be given a full text of the article between the tags
<article> and </article>
and a series of questions that you have to answer based only on the provided article text.
+
+STRICT RULES for how you work and respond:
 * Never invent values; use `null` when unknown.
 * Keep text exactly as in the article (no ellipses, no expansions).
 * Output all data fully, never skip or insert ellipses.
 * If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`.
-* Use the article’s wording for names.
+* Use the article's wording for names.
 * Do not copy sequences from examples!
 * No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis.
 * Use ONLY English language and Latin script, only ASCII.
+* Output only a single JSON object that conforms to the provided JSON Schema.
 * For the perfect result compliant to all constraints and limitations I will tip $2000!
 Perform the following tasks for JSON extraction:
diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py
new file mode 100755
index 0000000..fbadc4c
--- /dev/null
+++ b/extraction/pipeline_pre_quest.py
@@ -0,0 +1,803 @@
+# pipeline_filedriven.py
+# -*- coding: utf-8 -*-
+"""
+File-driven multi-pass extractor with Outlines + Ollama.
+
+- Reads config, prompts, and schemas from disk (Git-friendly).
+- Runs A..F passes (configurable) with Outlines JSON-guided generation.
+- Saves raw text (*.txt), pretty JSON (*.json), and errors (*.log), never overwriting.
+- Stitches pass outputs into a full object, validates against full schema (if provided),
+  and optionally inserts into SQLite via hyb_db.insert_article_object.
+ +Requirements: + pip install outlines ollama jsonschema tqdm + +Usage (script): + from pipeline_filedriven import run_project + run_project("your_project_dir") + +The project_dir must contain (by default): + config/pipeline.json + passes//{schema.json,prompt.txt} + schemas/full.json + inputs/*.txt +""" + +import json +import logging +import re +import os, sys +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import ollama +import outlines +from jsonschema import Draft202012Validator +from outlines.types import JsonSchema +from tqdm import tqdm + +API_TOKEN = os.getenv("OPEN_BUTTON_TOKEN", None) + + +# ────────────────────────────────────────────────────────────────────── +# Config models +# ────────────────────────────────────────────────────────────────────── + +@dataclass +class PassConfig: + """Single extraction pass config loaded from pipeline.json.""" + name: str # e.g., "A_core" + schema_path: Path # path to JSON Schema file + prompt_path: Path # path to the prompt .txt file + timeout: Optional[int] + +@dataclass +class PipelineConfig: + """Pipeline config loaded from config/pipeline.json.""" + model_names: List[str] + ollama_parameters: Dict[str, Any] + ollama_base_url: str + timeout_s: Optional[int] + input_dir: Path + out_dir: Path + full_schema_path: Optional[Path] + single_experiment_schema_path: Optional[Path] + db_path: Optional[Path] + article_glob: str + pre_passes: List[PassConfig] + passes: List[PassConfig] + + +def model_name_encode(model_name: str) -> str: + return model_name.replace("/", "_").replace("\\", "_").replace(":", "_") + +def load_pipeline_config(project_dir: Path) -> PipelineConfig: + """Load pipeline.json and construct a PipelineConfig. 
+ + Expected JSON structure in config/pipeline.json: + { + "model_name": "myaniu/qwen2.5-1m:7b", + "num_ctx": 131072, + "num_predict": 65536, + "timeout_s": 1800, + "input_dir": "inputs", + "out_dir": "out", + "full_schema_path": "schemas/full.json", + "db_path": "out/massive.sqlite", // or null to skip DB + "article_glob": "*.txt", + "passes": [ + {"name": "A_core", "schema": "passes/A_core/schema.json", "prompt": "passes/A_core/prompt.txt"}, + {"name": "B_index", "schema": "passes/B_index/schema.json", "prompt": "passes/B_index/prompt.txt"}, + {"name": "C_sequences", "schema": "passes/C_sequences/schema.json", "prompt": "passes/C_sequences/prompt.txt"}, + {"name": "D_parameters","schema": "passes/D_parameters/schema.json","prompt": "passes/D_parameters/prompt.txt"}, + {"name": "E_outcomes", "schema": "passes/E_outcomes/schema.json", "prompt": "passes/E_outcomes/prompt.txt"}, + {"name": "F_pairings", "schema": "passes/F_pairings/schema.json", "prompt": "passes/F_pairings/prompt.txt"} + ] + } + """ + cfg_path = project_dir / "config" / "pipeline.json" + data = json.loads(cfg_path.read_text(encoding="utf-8")) + + def _opt_path(p) -> Optional[Path]: + return (project_dir / p) if p else None + + pre_passes: List[PassConfig] = [] + for p in data["pre_passes"]: + pre_passes.append( + PassConfig( + name=p["name"], + schema_path=project_dir / p["schema"], + prompt_path=project_dir / p["prompt"], + timeout=p.get("timeout", None) + ) + ) + + passes: List[PassConfig] = [] + for p in data["passes"]: + passes.append( + PassConfig( + name=p["name"], + schema_path=project_dir / p["schema"], + prompt_path=project_dir / p["prompt"], + timeout=p.get("timeout", None) + ) + ) + + return PipelineConfig( + model_names=list(data.get("model_names", [])), + ollama_parameters=dict(data.get("ollama_parameters", {})), + ollama_base_url=str(data.get("ollama_base_url", None)), + timeout_s=int(data.get("timeout_s", None)), + input_dir=project_dir / data.get("input_dir", "inputs"), + 
out_dir=project_dir / data.get("out_dir", "out"), + full_schema_path=_opt_path(data.get("full_schema_path")), + single_experiment_schema_path=_opt_path(data.get("single_experiment_schema_path")), + db_path=_opt_path(data.get("db_path")), + article_glob=data.get("article_glob", "*.txt"), + pre_passes=pre_passes, + passes=passes, + ) + + +# ────────────────────────────────────────────────────────────────────── +# Logging +# ────────────────────────────────────────────────────────────────────── + +def _make_logger(log_dir: Path) -> logging.Logger: + log_dir.mkdir(parents=True, exist_ok=True) + logger = logging.getLogger("pipeline_filedriven") + logger.setLevel(logging.INFO) + logger.handlers.clear() + + ch = logging.StreamHandler(sys.stdout) + ch.setLevel(logging.INFO) + ch.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")) + logger.addHandler(ch) + + fh = logging.FileHandler(log_dir / "pipeline.log", encoding="utf-8") + fh.setLevel(logging.INFO) + fh.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")) + logger.addHandler(fh) + return logger + + +# ────────────────────────────────────────────────────────────────────── +# Tools (Ollama helpers) — Google-style docstrings +# ────────────────────────────────────────────────────────────────────── + +def to_si(value: Optional[float], unit: Optional[str]) -> Tuple[Optional[float], Optional[str]]: + """Convert a numeric value and unit to SI. + + Supports temperature and common concentrations. + + Args: + value: Parsed numeric value or None. + unit: Unit string as written (e.g., '°C', 'mM', 'µM', 'nM', '%', 'K'). + + Returns: + A pair (si_value, si_unit), or (None, None) if unknown. 
+ """ + if value is None or unit is None: + return None, None + u = unit.strip().lower().replace("µ", "u") + if u in {"c", "°c", "deg c", "celsius"}: + return value + 273.15, "K" + if u in {"k", "kelvin"}: + return value, "K" + if u in {"m", "mol/l"}: + return value * 1000.0, "mol/m^3" + if u in {"mm", "mmol/l", "mmol", "mm"}: + return value * 1.0, "mol/m^3" + if u in {"um", "umol/l", "µm", "µmol/l", "micromolar"}: + return value * 1e-3, "mol/m^3" + if u in {"nm", "nmol/l", "nanomolar"}: + return value * 1e-6, "mol/m^3" + if u in {"%", "percent", "perc"}: + return value / 100.0, "dimensionless" + return None, None + + +OLIGO_RE = re.compile( + r"^\s*(?:(?P(?:5|3)(?:['′’]|0|O)?)\s*-\s*)?(?:(?P(?:[A-Za-z0-9+]+-)+))?" + r"(?P[ACGUTRYSWKMBDHVN]+)(?:(?P(?:-[A-Za-z0-9+]+)+))?" + r"(?:\s*\(\s*(?P\d+)\s*(?:b|bp)\s*\)\s*)?\s*$", + re.X, +) + +def parse_oligo(raw: Optional[str]) -> Dict[str, Any]: + """Parse a decorated oligo string into structured parts. + + Args: + raw: The exact oligo string from the article (may include labels and length). + + Returns: + A dict with keys: raw, sequence, length_bases, prime_prefix, + five_prime_label, three_prime_label, labels, sense_antisense (None). 
+ """ + result = { + "raw": raw, + "sequence": None, + "length_bases": None, + "prime_prefix": None, + "five_prime_label": None, + "three_prime_label": None, + "labels": [], + "sense_antisense": None, + } + if not raw: + return result + m = OLIGO_RE.match(raw) + if not m: + return result + prime = m.group("prime") + if prime: + result["prime_prefix"] = 5 if prime.startswith("5") else 3 + seq = m.group("seq") + if seq: + result["sequence"] = seq.upper() + if m.group("len"): + result["length_bases"] = int(m.group("len")) + labels: List[str] = [] + if m.group("prefix"): + labels += [x for x in m.group("prefix").split("-") if x] + if m.group("suffix"): + labels += [x for x in m.group("suffix").split("-") if x] + result["labels"] = labels + if labels: + result["five_prime_label"] = labels[0] + result["three_prime_label"] = labels[-1] + return result + + +def make_measurement(raw: Optional[str], value: Optional[float] = None, unit: Optional[str] = None) -> Dict[str, Any]: + """Build a 'measurement' object with SI conversion. + + Args: + raw: Raw textual measurement (e.g., '58 °C', '2 mM', '10%'). + value: Parsed numeric value, if available. + unit: Unit string as written. + + Returns: + A dict with keys: raw, value, unit, si_value, si_unit, assumptions (None). 
+ """ + si_value, si_unit = to_si(value, unit) if (value is not None and unit is not None) else (None, None) + return { + "raw": raw or "", + "value": value, + "unit": unit, + "si_value": si_value, + "si_unit": si_unit, + "assumptions": None, + } + + +# ────────────────────────────────────────────────────────────────────── +# JSON helpers +# ────────────────────────────────────────────────────────────────────── + +def repair_json(text: str) -> str: + """Best-effort JSON repair for streamed outputs.""" + start = text.find("{") + end = text.rfind("}") + if start == -1 or end == -1 or end <= start: + return text + candidate = text[start : end + 1] + try: + json.loads(candidate) + return candidate + except Exception: + candidate = re.sub(r",\s*([}\]])", r"\1", candidate) + json.loads(candidate) + return candidate + + +# ────────────────────────────────────────────────────────────────────── +# Outlines runner +# ────────────────────────────────────────────────────────────────────── + +def _now_stamp() -> str: + return datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f") + + +def run_single_pass( + model: Any, + article_text: str, + pass_cfg: PassConfig, + out_base: Path, + article_stem: str, + tools: List[Any], + logger: logging.Logger, + ollama_parameters: Dict[str, Any], + model_name: str, +) -> Dict[str, Any]: + """Run one pass (schema+prompt from files), save raw+json+log, return object.""" + txt_dir = out_base / "txt" + json_dir = out_base / "json" + log_dir = out_base / "logs" + for d in (txt_dir, json_dir, log_dir): + d.mkdir(parents=True, exist_ok=True) + + js = JsonSchema(pass_cfg.schema_path.read_text(encoding="utf-8")) + validator = Draft202012Validator(json.loads(js.schema)) + prompt = pass_cfg.prompt_path.read_text(encoding="utf-8") + + stamp = _now_stamp() + raw_txt_path = txt_dir / f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.txt" + json_out_path = json_dir / 
f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.json" + err_log_path = log_dir / f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.log" + + logger.info(f"[{pass_cfg.name}:{model_name}] generating …") + response = "" + try: + # for chunk in model.stream( + # prompt + "\n\n" + article_text, + # output_type=js, + # options=ollama_parameters, + # tools=tools, + # ): + # response += chunk + response = model.generate( + prompt + "\n" + "And here is the article text you must base your answer on:\n\n
\n" + article_text + "\n<\\article>\n", + output_type=js, + options=ollama_parameters, + #tools=tools, # TODO: Temporarily switch tools off + ) + except Exception as e: + logger.exception(f"[{pass_cfg.name}:{model_name}] stream error") + err_log_path.write_text(f"STREAM ERROR:\n{e}\n", encoding="utf-8") + raise + + raw_txt_path.write_text(response, encoding="utf-8") + + try: + fixed = repair_json(response) + obj = json.loads(fixed) + except Exception as e: + logger.exception(f"[{pass_cfg.name}:{model_name}] JSON parse error") + err_log_path.write_text(f"JSON ERROR:\n{e}\nRAW:\n{response}\n", encoding="utf-8") + raise + + errors = sorted(validator.iter_errors(obj), key=lambda er: er.path) + if errors: + msg = "\n".join(str(e) for e in errors) + logger.error(f"[{pass_cfg.name}:{model_name}] validation errors:\n{msg}") + err_log_path.write_text(f"VALIDATION ERRORS:\n{msg}\nJSON:\n{json.dumps(obj, indent=2)}", encoding="utf-8") + else: + logger.info(f"[{pass_cfg.name}:{model_name}] validation OK") + logger.info(f"[{pass_cfg.name}] validation OK [{model_name}]") + + json_out_path.write_text(json.dumps(obj, indent=2, ensure_ascii=False), encoding="utf-8") + return obj + + +# ────────────────────────────────────────────────────────────────────── +# Stitcher (to your full object) +# ────────────────────────────────────────────────────────────────────── + +def _merge_reports(*reports: Optional[Dict[str, Any]]) -> Dict[str, Any]: + out = {"missing": [], "uncertain": [], "notes": None} + notes = [] + for r in reports: + if not r: + continue + out["missing"].extend(r.get("missing") or []) + out["uncertain"].extend(r.get("uncertain") or []) + if r.get("notes"): + notes.append(str(r["notes"])) + out["missing"] = list(dict.fromkeys(out["missing"])) + out["uncertain"] = list(dict.fromkeys(out["uncertain"])) + out["notes"] = " | ".join(notes) if notes else None + return out + + +def _to_si(value: Optional[float], unit: Optional[str]) -> Tuple[Optional[float], Optional[str]]: + 
return to_si(value, unit) + + +def _to_measurement_full(m_lite: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]: + if not m_lite: + return None + raw = m_lite.get("raw") or "" + value = m_lite.get("value") + unit = m_lite.get("unit") + si_value, si_unit = _to_si(value, unit) if (value is not None and unit is not None) else (None, None) + return { + "raw": raw, + "value": value, + "unit": unit, + "si_value": si_value, + "si_unit": si_unit, + "assumptions": None, + "provenance": { + "source_type": "unknown", + "page": None, + "section": None, + "quote": None, + "notes": None, + }, + } + + +def _detect_sa_from_name(name: Optional[str]) -> Optional[str]: + if not name: + return None + n = name.strip().lower() + if n.endswith(")as"): + return "antisense" + if n.endswith(")s"): + return "sense" + return None + + +def _coerce_sa(value: Optional[str], name: Optional[str]) -> Optional[str]: + m = {"s": "sense", "as": "antisense", "sense": "sense", "antisense": "antisense", "+": "sense", "-": "antisense"} + if value is None or (isinstance(value, str) and not value.strip()): + return _detect_sa_from_name(name) + return m.get(str(value).strip().lower(), _detect_sa_from_name(name)) + + +def _to_oligo_full(ol: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]: + if not ol: + return None + return { + "raw": ol.get("raw") or "", + "sequence": ol.get("sequence"), + "length_bases": ol.get("length_bases"), + "prime_prefix": None, + "five_prime_label": ol.get("five_prime_label"), + "three_prime_label": ol.get("three_prime_label"), + "labels": [], + "sense_antisense": ol.get("sense_antisense"), + "provenance": { + "source_type": "unknown", + "page": None, + "section": None, + "quote": None, + "notes": None, + }, + } + + +def stitch_full( + A_core: Dict[str, Any], + B_index: Dict[str, Any], + C_sequences: Dict[str, Any], + D_parameters: Dict[str, Any], + E_outcomes: Dict[str, Any], + F_pairings: Dict[str, Any], +) -> Dict[str, Any]: + core = {"doi": A_core.get("doi"), 
"abstract": A_core.get("abstract"), "topic": A_core.get("topic")} + E: Dict[str, Dict[str, Any]] = {} + for e in (B_index.get("experiments") or []): + E[e["id_exp"]] = { + "id_exp": e["id_exp"], + "raw_description": e.get("raw_description"), + "type": e.get("type"), + "description": e.get("description"), + "metadata": {}, + "sequences": {}, + "experiment_properties": {}, + "outcome": {}, + "pairing": {}, + "extraction_report": {"missing": [], "uncertain": [], "notes": None}, + } + + for item in (C_sequences.get("items") or []): + ie = item["id_exp"] + if ie not in E: + continue + prb = item.get("probe") or {} + seqs = {} + seqs["probe"] = { + "name": prb.get("name"), + "amplicon_id": prb.get("amplicon_id"), + "oligo": _to_oligo_full(prb.get("oligo")), + "fluorophore": prb.get("fluorophore"), + "quencher": prb.get("quencher"), + "sense_antisense": _coerce_sa(prb.get("sense_antisense"), prb.get("name")), + "notes": prb.get("notes"), + } + tgt = item.get("target_sequence") + seqs["target_sequence"] = _to_oligo_full(tgt) if tgt is not None else None + pr = item.get("primer_sequences") + if isinstance(pr, dict): + seqs["primer_sequences"] = {"forward": _to_oligo_full(pr.get("forward")), "reverse": _to_oligo_full(pr.get("reverse"))} + else: + seqs["primer_sequences"] = None + rels = [] + for rs in (item.get("related_sequences") or []): + rels.append({"related_sequence": _to_oligo_full(rs.get("related_sequence")), "description": rs.get("description")}) + seqs["related_sequences"] = rels + E[ie]["sequences"] = seqs + + for item in (D_parameters.get("items") or []): + ie = item["id_exp"] + if ie not in E: + continue + MD: Dict[str, Any] = {} + _md = item.get("metadata") or {} + MD["organism"] = _md.get("organism") + MD["technology"] = _md.get("technology") + ann = _md.get("annealing") + if ann is None: + MD["annealing"] = None + elif isinstance(ann, dict): + MD["annealing"] = { + "quantitative": _to_measurement_full(ann.get("quantitative")), + "qualitative": 
ann.get("qualitative"), + } + else: + MD["annealing"] = None + MD["pH"] = _to_measurement_full(_md.get("pH")) + ri = _md.get("rna_impurities") + if ri is None: + MD["rna_impurities"] = None + elif isinstance(ri, dict): + MD["rna_impurities"] = { + "quantitative": _to_measurement_full(ri.get("quantitative")), + "qualitative": ri.get("qualitative"), + } + else: + MD["rna_impurities"] = None + + EP: Dict[str, Any] = {} + concs = (item.get("experiment_properties") or {}).get("concentrations") or {} + EP["concentrations"] = { + "dna_rna_concentration": _to_measurement_full(concs.get("dna_rna_concentration")), + "concentration_SI": _to_measurement_full(concs.get("concentration_SI")), + } + pars = (item.get("experiment_properties") or {}).get("parameters_SI") or {} + EP["parameters_SI"] = { + "temperature": _to_measurement_full(pars.get("temperature")), + "Tris": _to_measurement_full(pars.get("Tris")), + "Na": _to_measurement_full(pars.get("Na")), + "K": _to_measurement_full(pars.get("K")), + "Mg": _to_measurement_full(pars.get("Mg")), + "DMSO": _to_measurement_full(pars.get("DMSO")), + } + E[ie]["metadata"] = MD + E[ie]["experiment_properties"] = EP + + for item in (E_outcomes.get("items") or []): + ie = item["id_exp"] + if ie not in E: + continue + E[ie]["outcome"] = { + "outcome": item.get("outcome"), + "fluorescence": _to_measurement_full(item.get("fluorescence")), + "comparative_notes": item.get("comparative_notes"), + } + + for item in (F_pairings.get("items") or []): + ie = item["id_exp"] + if ie not in E: + continue + E[ie]["pairing"] = { + "paired_with_probe_name": item.get("paired_with_probe_name"), + "relationship": item.get("relationship"), + } + + full_report = _merge_reports( + A_core.get("extraction_report"), + B_index.get("extraction_report"), + C_sequences.get("extraction_report"), + D_parameters.get("extraction_report"), + E_outcomes.get("extraction_report"), + F_pairings.get("extraction_report"), + ) + + return { + "doi": core["doi"], + "abstract": 
core["abstract"], + "topic": core["topic"], + "experiments": list(E.values()), + "extraction_report": full_report, + } + +def _deep_merge_keep_left(a, b): + """Shallow-friendly deep merge: keep a's non-null scalars; use b if a is None. + - Dicts: recurse. + - Lists: concatenate (no dedup). + - Scalars: prefer a unless a is None/empty, then b. + """ + if a is None: + return b + if b is None: + return a + if isinstance(a, dict) and isinstance(b, dict): + out = dict(a) + for k, bv in b.items(): + av = out.get(k) + out[k] = _deep_merge_keep_left(av, bv) if k in out else bv + return out + if isinstance(a, list) and isinstance(b, list): + return a + b + # prefer a unless it's falsy and b is truthy + return a if a not in (None, "", []) else b + + +def aggregate_c_outputs(outputs: Dict[str, Dict[str, Any]]) -> Dict[str, Any]: + """Build a consolidated C_sequences object from any of: C_sequences, C1_probe_core, C2_target_primers, C3_related.""" + # Start with single-pass C if present + base = outputs.get("C_sequences") or {"items": [], "extraction_report": {"missing": [], "uncertain": [], "notes": None}} + + # Build item index by id_exp from base + items_map: Dict[str, Dict[str, Any]] = {} + for it in base.get("items", []): + if not isinstance(it, dict) or "id_exp" not in it: + continue + items_map[it["id_exp"]] = dict(it) + + def _merge_from(pass_name: str, fields: List[str]): + obj = outputs.get(pass_name) + if not isinstance(obj, dict): + return + for it in obj.get("items", []): + if not isinstance(it, dict) or "id_exp" not in it: + continue + ie = it["id_exp"] + tgt = items_map.setdefault(ie, {"id_exp": ie}) + for f in fields: + if f in it: + tgt[f] = _deep_merge_keep_left(tgt.get(f), it[f]) + + # merge extraction report + br = base.get("extraction_report") or {"missing": [], "uncertain": [], "notes": None} + er = obj.get("extraction_report") or {"missing": [], "uncertain": [], "notes": None} + br["missing"] = list((br.get("missing") or []) + (er.get("missing") or [])) 
+ br["uncertain"] = list((br.get("uncertain") or []) + (er.get("uncertain") or [])) + br_notes = [n for n in [br.get("notes"), er.get("notes")] if n] + br["notes"] = " | ".join(br_notes) if br_notes else None + base["extraction_report"] = br + + # Merge micro-passes over base (C1, C2, C3). The field names match your schemas. + _merge_from("C1_probe_core", ["probe"]) + _merge_from("C2_target_primers", ["target_sequence", "primer_sequences"]) + _merge_from("C3_related", ["related_sequences"]) + + # Produce consolidated list + merged_items = list(items_map.values()) + # Normalize: ensure all top-level keys exist for stitch_full + for it in merged_items: + it.setdefault("probe", None) + it.setdefault("target_sequence", None) + it.setdefault("primer_sequences", None) + it.setdefault("related_sequences", []) + + return {"items": merged_items, "extraction_report": base.get("extraction_report") or {"missing": [], "uncertain": [], "notes": None}} + + +# ────────────────────────────────────────────────────────────────────── +# Project runner +# ────────────────────────────────────────────────────────────────────── + +def run_project(project_dir: str | Path) -> None: + """Run the pipeline as configured by files under project_dir.""" + project_dir = Path(project_dir) + cfg = load_pipeline_config(project_dir) + + out_base = cfg.out_dir + out_base.mkdir(parents=True, exist_ok=True) + logger = _make_logger(out_base / "logs") + + headers = dict() + if API_TOKEN is not None: + headers['Authorization'] = f'Bearer {API_TOKEN}' + + # Ollama client + Outlines model + client = ollama.Client(host=cfg.ollama_base_url, timeout=cfg.timeout_s, headers=headers) + + for model_name in cfg.model_names: + model = outlines.from_ollama(client, model_name) + tools = [to_si, parse_oligo, make_measurement] + + # Optional full-schema validator + full_validator = None + if cfg.full_schema_path and cfg.full_schema_path.exists(): + try: + full_schema_text = cfg.full_schema_path.read_text(encoding="utf-8") 
+ full_validator = Draft202012Validator(json.loads(full_schema_text)) + logger.info("Loaded full schema for final validation.") + except Exception: + logger.exception("Failed to load/parse full schema; proceeding without final validation.") + + logger.info(f"Article glob: {cfg.article_glob}") + + # Iterate input articles + files = sorted(cfg.input_dir.glob(cfg.article_glob)) + logger.info(f"Files: {files}") + + for art_path in tqdm(files, desc="Articles"): + article_name = art_path.stem + logger.info(f"=== {article_name} : {model_name} ===") + article_text = art_path.read_text(encoding="utf-8") + + # Run configured pre-passes + outputs: Dict[str, Dict[str, Any]] = {} + for p in tqdm(cfg.pre_passes, desc=f"{article_name} pre-passes", leave=False): + try: + outputs[p.name] = run_single_pass( + model=model, + article_text=article_text, + pass_cfg=p, + out_base=out_base, + article_stem=article_name, + tools=tools, + logger=logger, + ollama_parameters=cfg.ollama_parameters, + model_name=model_name, + ) + except Exception: + logger.exception(f"Pass failed: {p.name} : {article_name} : {model_name}") + + all_found_sequences = ", ".join(outputs["SeqPrompt_strict"]) + logger.info("Pre-passes done, found sequences: " + all_found_sequences) + + # for p in tqdm(cfg.passes, desc=f"{article_name} passes", leave=False): + # try: + # outputs[p.name] = run_single_pass( + # model=model, + # article_text=article_text, + # pass_cfg=p, + # out_base=out_base, + # article_stem=article_name, + # tools=tools, + # logger=logger, + # ollama_parameters=cfg.ollama_parameters, + # model_name=model_name, + # ) + # except Exception: + # logger.exception(f"Pass failed: {p.name} : {article_name} : {model_name}") + + # # Stitch only if the expected pass names are present + # try: + # A = outputs.get("A_core", {}) + # B = outputs.get("B_index", {}) + # # C = outputs.get("C_sequences", {}) + # C = aggregate_c_outputs(outputs) + # D = outputs.get("D_parameters", {}) + # E = outputs.get("E_outcomes", {}) 
+ # F = outputs.get("F_pairings", {}) + # full_obj = stitch_full(A, B, C, D, E, F) + + # # Final validation + # if full_validator: + # errs = sorted(full_validator.iter_errors(full_obj), key=lambda e: e.path) + # if errs: + # logger.error(f"[FULL] validation errors for {article_name} : {model_name}:\n" + "\n".join(str(e) for e in errs)) + # else: + # logger.info(f"[FULL] validation OK for {article_name} : {model_name}") + + # # Save full object (timestamped) + # stamp = _now_stamp() + # full_dir = out_base / "json_full" + # full_dir.mkdir(parents=True, exist_ok=True) + # full_path = full_dir / f"{article_name}_{model_name_encode(model_name)}__FULL__{stamp}.json" + # full_path.write_text(json.dumps(full_obj, indent=2, ensure_ascii=False), encoding="utf-8") + # logger.info(f"[FULL] wrote {full_path.name} {article_name} : {model_name}") + + # # Optional DB insert + # if cfg.db_path: + # try: + # from hyb_db import insert_article_object # your earlier module + # run_id = insert_article_object( + # db_path=str(cfg.db_path), + # article_obj=full_obj, + # model_name=model_name, + # article_name=article_name, + # ) + # logger.info(f"[DB] inserted run_id={run_id} for {article_name} : {model_name}") + # except Exception: + # logger.exception("[DB] insertion failed") + # except Exception: + # logger.exception(f"[FULL] stitching failed for {article_name} : {model_name}") + + +# Optional CLI hook (project_dir arg) +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python pipeline_filedriven.py ") + sys.exit(1) + run_project(sys.argv[1]) diff --git a/extraction/schemas/single_experiment.json b/extraction/schemas/single_experiment.json new file mode 100644 index 0000000..20e24b6 --- /dev/null +++ b/extraction/schemas/single_experiment.json @@ -0,0 +1,798 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://example.org/schemas/hybridization-article.schema.json", + "title": "Hybridization Article", + "description": "Per-article 
extraction of hybridization experiments as target-probe pairs (plus primers/related sequences). Includes decorated oligos (fluorophores/quenchers, 5'/3' marks, sense/antisense), and parameters stored as raw text and normalized SI.", + "type": "object", + "unevaluatedProperties": false, + "$defs": { + "extractionReport": { + "type": "object", + "description": "Structured way to declare missing/uncertain items to avoid hallucination. Use JSON Pointers for field locations.", + "additionalProperties": false, + "required": [ + "missing", + "uncertain", + "notes" + ], + "properties": { + "missing": { + "type": "array", + "description": "JSON Pointers to fields that are truly unavailable in the article.", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 0 + }, + "uncertain": { + "type": "array", + "description": "JSON Pointers to fields that are ambiguous or weakly supported.", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 0 + }, + "notes": { + "type": [ + "string", + "null" + ], + "description": "Free-text clarifications, e.g., OCR issues, mapping choices." + } + } + }, + "iupacBases": { + "type": "string", + "description": "DNA/RNA bases in uppercase IUPAC alphabet: A C G U/T R Y S W K M B D H V N. No separators and no ellipsis inside the sequence.", + "pattern": "^[ACGUTRYSWKMBDHVN]+$", + "minLength": 5, + "maxLength": 5000 + }, + "provenance": { + "type": "object", + "description": "Where a value was obtained in the source document.", + "additionalProperties": false, + "required": [ + "source_type", + "page", + "section", + "quote", + "notes" + ], + "properties": { + "source_type": { + "type": "string", + "enum": [ + "pdf", + "html", + "other", + "unknown" + ], + "description": "Type of source the extractor processed." + }, + "page": { + "type": [ + "integer", + "null" + ], + "minimum": 1, + "description": "Page number in the source (1-based), if applicable." 
+ }, + "section": { + "type": [ + "string", + "null" + ], + "description": "Section header or caption in which the value appears." + }, + "quote": { + "type": [ + "string", + "null" + ], + "description": "Short verbatim snippet that directly supports the value." + }, + "notes": { + "type": [ + "string", + "null" + ], + "description": "Extractor notes (e.g., OCR artifact, inferred mapping)." + } + } + }, + "measurement": { + "type": "object", + "description": "Numeric (or quasi-numeric) item holding raw text, optional parsed value/unit, and normalized SI value/unit.", + "additionalProperties": false, + "required": [ + "raw", + "value", + "unit", + "si_value", + "si_unit", + "assumptions", + "provenance" + ], + "properties": { + "raw": { + "type": "string", + "minLength": 1, + "maxLength": 200, + "description": "Exact text as written in the article (e.g., '58 °C', '2 mM', '10%')." + }, + "value": { + "type": [ + "number", + "null" + ], + "description": "Parsed numeric value if present in raw." + }, + "unit": { + "type": [ + "string", + "null" + ], + "description": "Unit as written in the article (e.g., '°C', 'mM', '%')." + }, + "si_value": { + "type": [ + "number", + "null" + ], + "description": "Value converted to SI. Examples: temperature in K; concentrations in mol/m^3; fractions 0-1 for percent." + }, + "si_unit": { + "type": [ + "string", + "null" + ], + "enum": [ + "K", + "mol/m^3", + "Pa", + "kg/m^3", + "s", + "dimensionless" + ], + "description": "SI unit after conversion." + }, + "assumptions": { + "type": [ + "string", + "null" + ], + "description": "Conversion assumptions (e.g., density used, ionic strength conventions)." + }, + "provenance": { + "$ref": "#/$defs/provenance" + } + } + }, + "decoratedOligo": { + "type": "object", + "description": "An oligonucleotide possibly decorated at 5'/3' with labels (fluorophores/quenchers). 
Keeps raw string and parsed parts.", + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "labels", + "sense_antisense", + "provenance" + ], + "properties": { + "raw": { + "type": "string", + "minLength": 5, + "maxLength": 200, + "description": "Exact oligo string as seen. MUST CONTAIN NUCLEOTIDES, NOT ONLY NAMES. DO NOT COPY THIS SEQUENCE FROM THE EXAMPLE! NEVER USE ELLIPSIS OR SKIP ANY DATA IN YOUR RESPONSE!!!", + "pattern": "^([ACGUTRYSWKMBDHVN]+)$|^(Exact sequence unknown, here is its description: (.*))$" + }, + "sequence": { + "$ref": "#/$defs/iupacBases", + "description": "Bare base sequence with IUPAC letters only (no labels/hyphens)." + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1, + "description": "Base length if given or derivable (e.g., '(27 b)')." + }, + "prime_prefix": { + "type": [ + "integer", + "null" + ], + "enum": [ + 3, + 5, + null + ], + "description": "Leading prime marker if present (3 or 5). Accepts OCR artifacts like 50/5O/5' during parsing." + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "description": "Label at the 5' end if indicated (e.g., FAM, ROX)." + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "description": "Label at the 3' end if indicated (e.g., BHQ1, BHQ2, RTQ1)." + }, + "labels": { + "type": "array", + "description": "All labels found in textual order, including 5' and 3' labels.", + "minItems": 0, + "maxItems": 10, + "items": { + "type": "string" + } + }, + "sense_antisense": { + "type": [ + "string", + "null" + ], + "enum": [ + "sense", + "antisense", + null + ], + "description": "If the oligo is explicitly designated as sense (s) or antisense (as) in the article." 
+ }, + "provenance": { + "$ref": "#/$defs/provenance" + } + } + }, + "primerPair": { + "type": "object", + "description": "PCR primer pair associated with an amplicon/experiment.", + "additionalProperties": false, + "required": [ + "forward", + "reverse" + ], + "properties": { + "forward": { + "$ref": "#/$defs/decoratedOligo", + "description": "Forward primer as decorated oligo." + }, + "reverse": { + "$ref": "#/$defs/decoratedOligo", + "description": "Reverse primer as decorated oligo." + } + } + }, + "probe": { + "type": "object", + "description": "A hybridization probe with name, optional amplicon ID, and decorated oligo details.", + "additionalProperties": false, + "required": [ + "name", + "oligo", + "amplicon_id", + "fluorophore", + "quencher", + "sense_antisense", + "notes" + ], + "properties": { + "name": { + "type": "string", + "minLength": 2, + "maxLength": 60, + "description": "Probe name exactly as used (e.g., 'N3-FAM(27)s')." + }, + "amplicon_id": { + "type": [ + "string", + "null" + ], + "description": "Amplicon tag associated with the probe (e.g., 'K2', 'K3', 'N2', 'N3', 'B15')." + }, + "oligo": { + "$ref": "#/$defs/decoratedOligo", + "description": "The probe's decorated oligo (sequence, labels, direction)." + }, + "fluorophore": { + "type": "string", + "description": "Fluorophore name if identifiable; otherwise null." + }, + "quencher": { + "type": "string", + "description": "Quencher name if identifiable; otherwise null." + }, + "sense_antisense": { + "type": [ + "string", + "null" + ], + "enum": [ + "sense", + "antisense", + null + ], + "description": "Sense/antisense designation inferred from probe name suffix (e.g., 's' or 'as')." + }, + "notes": { + "type": [ + "string", + "null" + ], + "description": "Free-text notes about the probe (ambiguities, special chemistry)." 
+ } + } + } + }, + "properties": { + "title": "Article with experiments/probes", + "type": "object", + "additionalProperties": false, + "required": [ + "doi", + "abstract", + "topic", + "experiments", + "extraction_report" + ], + "properties": { + "doi": { + "type": "string", + "minLength": 4, + "maxLength": 100, + "description": "Digital Object Identifier for the article." + }, + "experiment": { + "description": "Full description of a single hybridization experiment instance related to this sequence", + "type": "object", + "additionalProperties": false, + "required": [ + "id_exp", + "raw_description", + "experiment_type", + "description", + "metadata", + "sequences", + "experiment_properties", + "outcome", + "pairing", + "extraction_report" + ], + "properties": { + "id_exp": { + "type": "string", + "minLength": 1, + "maxLength": 120, + "description": "Unique experiment identifier (derive if needed from amplicon + probe name')." + }, + "raw_description": { + "type": [ + "string", + "null" + ], + "minLength": 1, + "maxLength": 1000, + "description": "Verbatim or lightly tidied description of the experiment from the article." + }, + "experiment_type": { + "type": "object", + "description": "Description of this single hybridization experiment design.", + "additionalProperties": false, + "required": ["probe_type", "chemistry"], + "probe_type": { + "title": "Probe Type", + "description": "Functional design/mechanism class of the probe. Use when the probe's reporting/capture behavior is known.", + "oneOf": [ + { + "const": "linear", + "title": "Linear", + "description": "Simple oligo that hybridizes without structural activation; often end-labeled." + }, + { + "const": "molecular_beacon", + "title": "Molecular beacon", + "description": "Stem-loop with fluor/quencher; fluorescence turns on when the stem opens on target." 
+ }, + { + "const": "hydrolysis_taqman", + "title": "Hydrolysis (TaqMan)", + "description": "Dual-labeled probe cleaved by 5'->3' exonuclease during PCR; signal increases on cleavage." + }, + { + "const": "fret_dual_hybridization", + "title": "FRET dual-hybridization", + "description": "Two adjacent probes with donor/acceptor dyes; FRET only when both bind nearby." + }, + { + "const": "scorpion", + "title": "Scorpion", + "description": "Primer-linked hairpin probe enabling fast intra-molecular reporting in qPCR." + }, + { + "const": "hcr", + "title": "Hybridization Chain Reaction (HCR)", + "description": "Initiator + hairpins polymerize upon target binding; enzyme-free signal amplification." + }, + { + "const": "branched_dna", + "title": "Branched DNA (bDNA)", + "description": "Signal amplification via multibranch DNA scaffolds without target amplification." + }, + { + "const": "padlock", + "title": "Padlock", + "description": "Linear probe circularizes on perfect match; detected via rolling-circle amplification." + }, + { + "const": "capture", + "title": "Capture", + "description": "Affinity-tagged probe used to enrich/pull down targets (e.g., biotinylated capture baits)." + }, + { + "const": "tiling_set", + "title": "Tiling set", + "description": "Multiple overlapping probes across a region/gene for robust detection." + }, + { + "const": "antisense", + "title": "Antisense", + "description": "Probe designed to bind and block or track transcripts (e.g., ISH tracking)." + } + ] + }, + "chemistry": { + "title": "Chemistry Backbone", + "description": "Backbone and chemical modifications (sequence-level chemistry rather than labels/reporters). Primary nucleic-acid scaffold used by the probe.", + "oneOf": [ + { + "const": "dna", + "title": "DNA", + "description": "Unmodified DNA backbone." + }, + { + "const": "rna", + "title": "RNA", + "description": "Unmodified RNA backbone." 
+ }, + { + "const": "cdna", + "title": "cDNA", + "description": "Complementary DNA derived from RNA." + }, + { + "const": "pna", + "title": "PNA", + "description": "Peptide nucleic acid backbone." + }, + { + "const": "morpholino", + "title": "Morpholino", + "description": "Morpholine-ring phosphorodiamidate backbone." + }, + { + "const": "lna_modified", + "title": "LNA-modified", + "description": "DNA/RNA with Locked Nucleic Acid residues incorporated." + }, + { + "const": "two_ome_rna", + "title": "2'-O-Me RNA", + "description": "2'-O-methyl RNA backbone." + } + ] + } + }, + "description": { + "type": "string", + "minLength": 10, + "maxLength": 1000, + "description": "Concise human-readable summary of this specific target-probe experiment." + }, + "metadata": { + "type": "object", + "additionalProperties": false, + "description": "High-level descriptors linked to this experiment.", + "required": [ + "organism", + "technology", + "annealing", + "pH", + "rna_impurities" + ], + "properties": { + "organism": { + "type": [ + "string", + "null" + ], + "minLength": 2, + "maxLength": 120, + "description": "Organism (e.g., 'human')." + }, + "technology": { + "type": [ + "string", + "null" + ], + "minLength": 2, + "maxLength": 120, + "description": "Assay/technology label per article usage (e.g., 'real-time PCR', 'DMA')." + }, + "annealing": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "description": "Annealing process details, with optional quantitative and qualitative components.", + "required": [ + "quantitative", + "qualitative" + ], + "properties": { + "quantitative": { + "$ref": "#/$defs/measurement", + "description": "Numeric representation (e.g., time or temperature), kept as raw + SI." + }, + "qualitative": { + "type": [ + "boolean", + "null" + ], + "description": "If the article states a qualitative annealing outcome/criterion." 
+ } + } + }, + "pH": { + "$ref": "#/$defs/measurement", + "description": "pH as raw text with optional parsed numeric; SI stored as dimensionless (same numeric value)." + }, + "rna_impurities": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "description": "RNA impurity information, if discussed.", + "required": [ + "quantitative", + "qualitative" + ], + "properties": { + "quantitative": { + "$ref": "#/$defs/measurement", + "description": "Quantity/percentage of RNA impurities." + }, + "qualitative": { + "type": [ + "boolean", + "null" + ], + "description": "Presence/absence or a qualitative statement regarding RNA impurities." + } + } + } + } + }, + "sequences": { + "type": "object", + "additionalProperties": false, + "description": "All sequences relevant to this target-probe experiment.", + "required": [ + "target_sequence", + "probe", + "primer_sequences", + "related_sequences" + ], + "properties": { + "target_sequence": { + "oneOf": [ + { + "$ref": "#/$defs/decoratedOligo" + }, + { + "type": "string", + "pattern": "^(Exact target sequence is unknown, here is its description: .*)$", + "minLength": 70, + "maxLength": 200 + } + ], + "description": "Target genomic sequence if explicitly given; store as decorated oligo only if labels are present; otherwise just sequence and length." + }, + "probe": { + "$ref": "#/$defs/probe", + "description": "The hybridization probe for this experiment." + }, + "primer_sequences": { + "oneOf": [ + { + "$ref": "#/$defs/primerPair" + }, + { + "type": "null" + } + ], + "description": "PCR primers associated with this experiment/amplicon if provided." 
+ }, + "related_sequences": { + "type": "array", + "description": "Additional related sequences (controls, references), if any.", + "minItems": 0, + "maxItems": 50, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "related_sequence", + "description" + ], + "properties": { + "related_sequence": { + "$ref": "#/$defs/decoratedOligo", + "description": "A related sequence (plain or decorated)." + }, + "description": { + "type": [ + "string", + "null" + ], + "minLength": 1, + "maxLength": 200, + "description": "Short explanation of the related sequence's role." + } + } + } + } + } + }, + "experiment_properties": { + "type": "object", + "additionalProperties": false, + "description": "Quantitative and buffer parameters for this experiment.", + "required": [ + "concentrations", + "parameters_SI" + ], + "properties": { + "concentrations": { + "type": "object", + "additionalProperties": false, + "description": "Concentration-related values.", + "required": [ + "dna_rna_concentration", + "concentration_SI" + ], + "properties": { + "dna_rna_concentration": { + "$ref": "#/$defs/measurement", + "description": "Analyte concentration as reported (raw) plus normalized SI (mol/m^3)." + }, + "concentration_SI": { + "$ref": "#/$defs/measurement", + "description": "Optional redundant SI-only concentration if the article already used SI; keep raw text synchronized." + } + } + }, + "parameters_SI": { + "type": "object", + "additionalProperties": false, + "description": "Assay buffer/condition parameters, represented as raw + SI. If any value is not present, fill-in measurements fields as null.", + "required": [ + "temperature", + "Tris", + "Na", + "K", + "Mg", + "DMSO" + ], + "properties": { + "temperature": { + "$ref": "#/$defs/measurement", + "description": "Temperature (e.g., '58 °C'), with SI in Kelvin." + }, + "Tris": { + "$ref": "#/$defs/measurement", + "description": "Tris buffer concentration; SI in mol/m^3 (1 mM = 1 mol/m^3)." 
+ }, + "Na": { + "$ref": "#/$defs/measurement", + "description": "Sodium ion concentration; SI in mol/m^3." + }, + "K": { + "$ref": "#/$defs/measurement", + "description": "Potassium ion concentration; SI in mol/m^3." + }, + "Mg": { + "$ref": "#/$defs/measurement", + "description": "Magnesium ion concentration; SI in mol/m^3." + }, + "DMSO": { + "$ref": "#/$defs/measurement", + "description": "DMSO amount (often % v/v); SI as dimensionless fraction (percent/100)." + } + } + } + } + }, + "outcome": { + "type": "object", + "additionalProperties": false, + "description": "Results for this target-probe pairing.", + "required": [ + "outcome", + "fluorescence", + "comparative_notes" + ], + "properties": { + "outcome": { + "type": [ + "boolean", + "null" + ], + "description": "Boolean result if explicitly stated (e.g., success/failure). If not explicit, leave null." + }, + "fluorescence": { + "$ref": "#/$defs/measurement", + "description": "Fluorescence or signal measurement (raw text + normalized form if numeric). If comparative only, keep statement in 'raw' and numeric fields null." + }, + "comparative_notes": { + "type": [ + "string", + "null" + ], + "minLength": 0, + "maxLength": 500, + "description": "Comparative statements (e.g., 'N3-FAM stronger in real-time PCR; N3-Cy5 stronger in DMA')." + } + } + }, + "pairing": { + "type": "object", + "additionalProperties": false, + "description": "Optional cross-references to paired/reciprocal probes within the same article.", + "required": [ + "paired_with_probe_name", + "relationship" + ], + "properties": { + "paired_with_probe_name": { + "type": [ + "string", + "null" + ], + "description": "Name of the other probe in a reciprocal comparison (e.g., 'N3-Cy5(27)s')." + }, + "relationship": { + "type": [ + "string", + "null" + ], + "description": "Short label describing the relation (e.g., 'reciprocal comparison', 'same sequence different labels')." 
+ } + } + }, + "extraction_report": { + "$ref": "#/$defs/extractionReport" + } + } + }, + "extraction_report": { + "$ref": "#/$defs/extractionReport" + } + } + } +} \ No newline at end of file From 1c2aca3dbe3ead14087275ec6dc9a14f694478b6 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Mon, 6 Oct 2025 04:32:49 +0400 Subject: [PATCH 034/102] Add models, enforce schema --- extraction/config/pipeline.json | 4 +++- extraction/passes/_1_SeqPrompt/schema.json | 2 +- extraction/passes/_1_SeqPrompt/schema_strict.json | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index b69055b..8910faa 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,9 +1,11 @@ { "model_names": [ "deepseek-r1:7b-qwen-distill-q4_K_M", + "myaniu/qwen2.5-1m:7b", "qwen2.5-coder:3b", "llama3.1:latest", - "qwen3:4b" + "qwen3:4b", + "myaniu/qwen2.5-1m:14b" ], "ollama_parameters": { "num_ctx": 131072, diff --git a/extraction/passes/_1_SeqPrompt/schema.json b/extraction/passes/_1_SeqPrompt/schema.json index 1f3b98c..f0ffc9b 100644 --- a/extraction/passes/_1_SeqPrompt/schema.json +++ b/extraction/passes/_1_SeqPrompt/schema.json @@ -3,7 +3,7 @@ "title": "AllSequences", "description": "All DNA, RNA and other sequences present in article", "type": "array", - "minItems": 0, + "minItems": 1, "maxItems": 1000, "items": { "type": "string", diff --git a/extraction/passes/_1_SeqPrompt/schema_strict.json b/extraction/passes/_1_SeqPrompt/schema_strict.json index 455e1b8..570e432 100644 --- a/extraction/passes/_1_SeqPrompt/schema_strict.json +++ b/extraction/passes/_1_SeqPrompt/schema_strict.json @@ -3,7 +3,7 @@ "title": "AllSequences", "description": "All DNA, RNA and other sequences present in article", "type": "array", - "minItems": 0, + "minItems": 1, "maxItems": 1000, "items": { "type": "string", From 5f3c3f39216955c6b7c3ee20d7313af4e78d7630 Mon Sep 17 00:00:00 2001 From: Alexander 
Serdyukov Date: Mon, 6 Oct 2025 05:02:20 +0400 Subject: [PATCH 035/102] Try implement pre-step --- extraction/config/pipeline.json | 11 +- extraction/passes/_1_SeqPrompt/prompt.txt | 3 +- .../passes/_1_SeqPrompt/prompt_strict.txt | 37 ++ extraction/passes/_2_Experiments/prompt.txt | 374 +--------------- .../passes/_2_Experiments/prompt_strict.txt | 422 ++++++++++++++++++ .../_3_ConstructSingleExperiment/prompt.txt | 18 + .../_3_ConstructSingleExperiment/schema.json} | 0 extraction/pipeline_pre_quest.py | 303 ++++++++++--- 8 files changed, 746 insertions(+), 422 deletions(-) create mode 100644 extraction/passes/_1_SeqPrompt/prompt_strict.txt create mode 100644 extraction/passes/_2_Experiments/prompt_strict.txt create mode 100644 extraction/passes/_3_ConstructSingleExperiment/prompt.txt rename extraction/{schemas/single_experiment.json => passes/_3_ConstructSingleExperiment/schema.json} (100%) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 8910faa..b519f5d 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -18,7 +18,6 @@ "input_dir": "input/md", "out_dir": "outlines_output_pre", "full_schema_path": "schema/json/article.json", - "single_experiment_schema_path": "schema/json/single_experiment.json", "db_path": "outlines_output/massive.sqlite", "article_glob": "**/*.md", "pre_passes": [ @@ -31,7 +30,7 @@ { "name": "SeqPrompt_strict", "schema": "passes/_1_SeqPrompt/schema_strict.json", - "prompt": "passes/_1_SeqPrompt/prompt.txt", + "prompt": "passes/_1_SeqPrompt/prompt_strict.txt", "timeout": 60 }, { @@ -43,10 +42,16 @@ { "name": "Experiments-strict", "schema": "passes/_2_Experiments/schema_strict.json", - "prompt": "passes/_2_Experiments/prompt.txt", + "prompt": "passes/_2_Experiments/prompt_strict.txt", "timeout": 60 } ], + "construct_single_experiment_pass": { + "name": "ConstructSingleExperiment", + "schema": "passes/_3_ConstructSingleExperiment/schema.json", + "prompt": 
"passes/_3_ConstructSingleExperiment/prompt.txt", + "timeout": 60 + }, "passes": [ { "name": "A_core", diff --git a/extraction/passes/_1_SeqPrompt/prompt.txt b/extraction/passes/_1_SeqPrompt/prompt.txt index 96a4d96..eb602b3 100644 --- a/extraction/passes/_1_SeqPrompt/prompt.txt +++ b/extraction/passes/_1_SeqPrompt/prompt.txt @@ -24,13 +24,12 @@ Here is the JSON schema you have to follow: "title": "AllSequences", "description": "All DNA, RNA and other sequences present in article", "type": "array", - "minItems": 0, + "minItems": 1, "maxItems": 1000, "items": { "type": "string", "minLength": 5, "maxLength": 150, - "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "A single sequence out of all the DNA, RNA and other sequences from the article." } } diff --git a/extraction/passes/_1_SeqPrompt/prompt_strict.txt b/extraction/passes/_1_SeqPrompt/prompt_strict.txt new file mode 100644 index 0000000..96a4d96 --- /dev/null +++ b/extraction/passes/_1_SeqPrompt/prompt_strict.txt @@ -0,0 +1,37 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following task: +* Extract all the DNA or RNA sequences provided in this article and provide them in a JSON format. + +Here is the JSON schema you have to follow: +```json +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "AllSequences", + "description": "All DNA, RNA and other sequences present in article", + "type": "array", + "minItems": 0, + "maxItems": 1000, + "items": { + "type": "string", + "minLength": 5, + "maxLength": 150, + "pattern": "^[ACGUTRYSWKMBDHVN]+$", + "description": "A single sequence out of all the DNA, RNA and other sequences from the article." + } +} +``` \ No newline at end of file diff --git a/extraction/passes/_2_Experiments/prompt.txt b/extraction/passes/_2_Experiments/prompt.txt index 65d39cd..9c26d9f 100644 --- a/extraction/passes/_2_Experiments/prompt.txt +++ b/extraction/passes/_2_Experiments/prompt.txt @@ -35,380 +35,20 @@ Here is the JSON schema you have to follow: "target": { "type": "string", "minLength": 5, - "maxLength": 150, - "description": "What is the target in this instance of hybridization experiment? 
If possible, please provide the sequence. If sequence is not available, describe the target in a free form.", - "pattern": "^([ACGUTRYSWKMBDHVN]+)$|^(Exact sequence unknown, target description: (.*))$" + "maxLength": 100, + "description": "What is the target in this instance of hybridization experiment? If possible, please provide the sequence. If sequence is not available, describe the target in a free form." }, "probe": { "type": "string", "minLength": 5, "maxLength": 100, - "description": "The hybridization probe in this instance of hybridization experiment.", - "pattern": "^[ACGUTRYSWKMBDHVN]+$" + "description": "The hybridization probe in this instance of hybridization experiment." }, "parameters": { - "type": "object", - "required": ["probe_type", "chemistry", "labeling", "targeting"], - "properties": { - "probe_type": { - "title": "Probe Type", - "description": "Functional design/mechanism class of the probe. Use when the probe's reporting/capture behavior is known.", - "oneOf": [ - { - "const": "linear", - "title": "Linear", - "description": "Simple oligo that hybridizes without structural activation; often end-labeled." - }, - { - "const": "molecular_beacon", - "title": "Molecular beacon", - "description": "Stem-loop with fluor/quencher; fluorescence turns on when the stem opens on target." - }, - { - "const": "hydrolysis_taqman", - "title": "Hydrolysis (TaqMan)", - "description": "Dual-labeled probe cleaved by 5'->3' exonuclease during PCR; signal increases on cleavage." - }, - { - "const": "fret_dual_hybridization", - "title": "FRET dual-hybridization", - "description": "Two adjacent probes with donor/acceptor dyes; FRET only when both bind nearby." - }, - { - "const": "scorpion", - "title": "Scorpion", - "description": "Primer-linked hairpin probe enabling fast intra-molecular reporting in qPCR." 
- }, - { - "const": "hcr", - "title": "Hybridization Chain Reaction (HCR)", - "description": "Initiator + hairpins polymerize upon target binding; enzyme-free signal amplification." - }, - { - "const": "branched_dna", - "title": "Branched DNA (bDNA)", - "description": "Signal amplification via multibranch DNA scaffolds without target amplification." - }, - { - "const": "padlock", - "title": "Padlock", - "description": "Linear probe circularizes on perfect match; detected via rolling-circle amplification." - }, - { - "const": "capture", - "title": "Capture", - "description": "Affinity-tagged probe used to enrich/pull down targets (e.g., biotinylated capture baits)." - }, - { - "const": "tiling_set", - "title": "Tiling set", - "description": "Multiple overlapping probes across a region/gene for robust detection." - }, - { - "const": "antisense", - "title": "Antisense", - "description": "Probe designed to bind and block or track transcripts (e.g., ISH tracking)." - } - ] - }, - "chemistry": { - "title": "Chemistry", - "description": "Backbone and chemical modifications (sequence-level chemistry rather than labels/reporters).", - "type": "object", - "additionalProperties": false, - "required": ["backbone"], - "properties": { - "backbone": { - "title": "Backbone", - "description": "Primary nucleic-acid scaffold used by the probe.", - "oneOf": [ - { - "const": "dna", - "title": "DNA", - "description": "Unmodified DNA backbone." - }, - { - "const": "rna", - "title": "RNA", - "description": "Unmodified RNA backbone." - }, - { - "const": "cdna", - "title": "cDNA", - "description": "Complementary DNA derived from RNA." - }, - { - "const": "pna", - "title": "PNA", - "description": "Peptide nucleic acid backbone." - }, - { - "const": "morpholino", - "title": "Morpholino", - "description": "Morpholine-ring phosphorodiamidate backbone." - }, - { - "const": "lna_modified", - "title": "LNA-modified", - "description": "DNA/RNA with Locked Nucleic Acid residues incorporated." 
- }, - { - "const": "two_ome_rna", - "title": "2'-O-Me RNA", - "description": "2'-O-methyl RNA backbone." - } - ] - }, - "modifications": { - "title": "Chemical Modifications", - "description": "Sequence-level chemistry (not labels): e.g., nuclease protection or affinity spacers.", - "type": "array", - "uniqueItems": true, - "items": { - "type": "string", - "enum": [ - "phosphorothioate", - "two_ome_spiked", - "lna_spiked", - "mgb", - "inverted_dT_3prime", - "amine_5prime", - "thiol_5prime", - "biotin_teg", - "spacer_18", - "cholesterol" - ], - "description": "Common modification keyword." - } - } - } - }, - "labeling": { - "title": "Labeling", - "description": "Reporting/enrichment labels attached to the probe (fluors, quenchers, haptens, enzymes, isotopes).", - "type": "object", - "additionalProperties": false, - "properties": { - "strategy": { - "title": "Label Strategy", - "description": "High-level labeling approach; combine with concrete labels below as known.", - "type": "string", - "enum": [ - "none", - "fluor_only", - "fluor_quencher", - "hapten", - "enzymatic", - "radioisotope" - ] - }, - "reporters": { - "title": "Reporter Dyes", - "description": "Fluorophores or other reporters (free text to allow any brand/dye).", - "type": "array", - "uniqueItems": true, - "items": { - "type": "string", - "description": "Reporter name (e.g., FAM, HEX, Cy3, ATTO647N)." - } - }, - "quenchers": { - "title": "Quenchers", - "description": "Quenchers used in hydrolysis or hairpin probes.", - "type": "array", - "uniqueItems": true, - "items": { - "type": "string", - "description": "Quencher name (e.g., BHQ1, BHQ2, Iowa Black FQ)." - } - }, - "haptens": { - "title": "Haptens", - "description": "Affinity tags detected by antibodies/streptavidin.", - "type": "array", - "uniqueItems": true, - "items": { - "type": "string", - "enum": [ - "biotin", - "digoxigenin", - "dinitrophenol", - "fluorescein_hapten" - ], - "description": "Common hapten tag." 
- } - }, - "enzymes": { - "title": "Enzyme Labels", - "description": "Enzyme conjugates used for colorimetric/chemiluminescent detection.", - "type": "array", - "uniqueItems": true, - "items": { - "type": "string", - "enum": [ - "HRP", - "AP" - ], - "description": "Common conjugated enzyme." - } - }, - "isotopes": { - "title": "Radioisotopes", - "description": "If radio-labeled, indicate isotope(s).", - "type": "array", - "uniqueItems": true, - "items": { - "type": "string", - "description": "Isotope (e.g., 32P, 33P, 35S)." - } - } - } - }, - "targeting": { - "title": "Targeting", - "description": "What the probe is intended to hybridize to, and in what context.", - "type": "object", - "additionalProperties": false, - "properties": { - "biomolecule": { - "title": "Biomolecule", - "description": "High-level target class.", - "type": "string", - "enum": [ - "dna", - "rna", - "mrna", - "mirna", - "lncrna", - "rrna", - "genomic_dna", - "viral_rna", - "amplicon" - ] - }, - "context": { - "title": "Context", - "description": "Assay/biological context for the target.", - "type": "string", - "enum": [ - "genomic", - "transcript", - "amplicon", - "in_situ", - "capture" - ] - }, - "target_name": { - "title": "Target Name", - "description": "Gene/transcript/locus identifier (free text).", - "type": "string" - } - } - }, - "set_design": { - "title": "Set / Panel Design", - "description": "Whether the probe is a single oligo or part of a designed set/panel.", - "type": "object", - "additionalProperties": false, - "properties": { - "mode": { - "title": "Set Mode", - "description": "Single probe or specific multi-probe design.", - "type": "string", - "enum": [ - "single", - "tiling_set", - "capture_baits", - "smfish_panel", - "merfish_panel", - "padlock_set" - ] - }, - "count": { - "title": "Probe Count", - "description": "Number of probes in the set/panel (if known).", - "type": "integer", - "minimum": 1 - } - } - }, - "amplification_mechanism": { - "title": "Amplification 
Mechanism", - "description": "Signal amplification paradigm, if applicable (complements—does not replace—probe_type).", - "type": "string", - "enum": [ - "none", - "hydrolysis", - "fret", - "hairpin_turn_on", - "rolling_circle", - "branched_dna", - "hcr" - ] - }, - "application": { - "title": "Application", - "description": "Intended use(s) of the probe. Provide multiple if applicable.", - "type": "array", - "uniqueItems": true, - "items": { - "type": "string", - "enum": [ - "qpcr", - "ddpcr", - "pcr_probe", - "fish", - "ish", - "smfish", - "merfish", - "ngs_capture", - "microarray", - "southern", - "northern", - "dot_blot", - "in_cell_imaging" - ], - "description": "Common application keyword." - } - }, - "provenance": { - "title": "Provenance", - "description": "Source metadata for traceability.", - "type": "object", - "additionalProperties": false, - "properties": { - "doi": { - "title": "DOI", - "description": "Digital Object Identifier of the source article.", - "type": "string", - "format": "iri" - }, - "pmid": { - "title": "PMID", - "description": "PubMed identifier.", - "type": "string" - }, - "vendor": { - "title": "Vendor", - "description": "Commercial supplier (if from a catalog).", - "type": "string" - }, - "catalog_number": { - "title": "Catalog Number", - "description": "Supplier's catalog identifier.", - "type": "string" - } - } - }, - "notes": { - "title": "Notes", - "description": "Free-text comments or qualifiers that don't fit other fields.", - "type": "string", - "examples": [ - "Probe includes internal ZEN quencher." - ] - } - } + "type": "string", + "minLength": 20, + "maxLength": 200, + "description": "Briefly describe the laboratory parameters used for setting up for this hybridization experiment." 
}, "hybridization_experiment_description": { "type": "string", diff --git a/extraction/passes/_2_Experiments/prompt_strict.txt b/extraction/passes/_2_Experiments/prompt_strict.txt new file mode 100644 index 0000000..65d39cd --- /dev/null +++ b/extraction/passes/_2_Experiments/prompt_strict.txt @@ -0,0 +1,422 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions that you have to answer based only on the provided article text. + +STRICT RULES for how you work and respond: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +A "hybridization experiment" in terms of this task is an instance of creating or testing a hybridization probe for some target sequence given some set of laboratory parameters. Even if the article mentions "experiments" as the domain-level entity, this task strictly requires you to treat each pair of the target sequence and probe sequence together with its set of parameters as the unique "hybridization experiment". + +Perform the following task: +* Create a list of all hybridization experiments found in the article text and provide it in the form of a JSON array, where each element is an object with the target, probe and parameters keys. 
+ +Here is the JSON schema you have to follow: +```json +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "AllHybridizationExperiments", + "description": "All hybridization experiments described in article", + "type": "array", + "minItems": 0, + "items": { + "description": "A single instance of hybridization experiment from the article.", + "type": "object", + "additionalProperties": false, + "properties": { + "target": { + "type": "string", + "minLength": 5, + "maxLength": 150, + "description": "What is the target in this instance of hybridization experiment? If possible, please provide the sequence. If sequence is not available, describe the target in a free form.", + "pattern": "^([ACGUTRYSWKMBDHVN]+)$|^(Exact sequence unknown, target description: (.*))$" + }, + "probe": { + "type": "string", + "minLength": 5, + "maxLength": 100, + "description": "The hybridization probe in this instance of hybridization experiment.", + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "parameters": { + "type": "object", + "required": ["probe_type", "chemistry", "labeling", "targeting"], + "properties": { + "probe_type": { + "title": "Probe Type", + "description": "Functional design/mechanism class of the probe. Use when the probe's reporting/capture behavior is known.", + "oneOf": [ + { + "const": "linear", + "title": "Linear", + "description": "Simple oligo that hybridizes without structural activation; often end-labeled." + }, + { + "const": "molecular_beacon", + "title": "Molecular beacon", + "description": "Stem-loop with fluor/quencher; fluorescence turns on when the stem opens on target." + }, + { + "const": "hydrolysis_taqman", + "title": "Hydrolysis (TaqMan)", + "description": "Dual-labeled probe cleaved by 5'->3' exonuclease during PCR; signal increases on cleavage." + }, + { + "const": "fret_dual_hybridization", + "title": "FRET dual-hybridization", + "description": "Two adjacent probes with donor/acceptor dyes; FRET only when both bind nearby." 
+ }, + { + "const": "scorpion", + "title": "Scorpion", + "description": "Primer-linked hairpin probe enabling fast intra-molecular reporting in qPCR." + }, + { + "const": "hcr", + "title": "Hybridization Chain Reaction (HCR)", + "description": "Initiator + hairpins polymerize upon target binding; enzyme-free signal amplification." + }, + { + "const": "branched_dna", + "title": "Branched DNA (bDNA)", + "description": "Signal amplification via multibranch DNA scaffolds without target amplification." + }, + { + "const": "padlock", + "title": "Padlock", + "description": "Linear probe circularizes on perfect match; detected via rolling-circle amplification." + }, + { + "const": "capture", + "title": "Capture", + "description": "Affinity-tagged probe used to enrich/pull down targets (e.g., biotinylated capture baits)." + }, + { + "const": "tiling_set", + "title": "Tiling set", + "description": "Multiple overlapping probes across a region/gene for robust detection." + }, + { + "const": "antisense", + "title": "Antisense", + "description": "Probe designed to bind and block or track transcripts (e.g., ISH tracking)." + } + ] + }, + "chemistry": { + "title": "Chemistry", + "description": "Backbone and chemical modifications (sequence-level chemistry rather than labels/reporters).", + "type": "object", + "additionalProperties": false, + "required": ["backbone"], + "properties": { + "backbone": { + "title": "Backbone", + "description": "Primary nucleic-acid scaffold used by the probe.", + "oneOf": [ + { + "const": "dna", + "title": "DNA", + "description": "Unmodified DNA backbone." + }, + { + "const": "rna", + "title": "RNA", + "description": "Unmodified RNA backbone." + }, + { + "const": "cdna", + "title": "cDNA", + "description": "Complementary DNA derived from RNA." + }, + { + "const": "pna", + "title": "PNA", + "description": "Peptide nucleic acid backbone." 
+ }, + { + "const": "morpholino", + "title": "Morpholino", + "description": "Morpholine-ring phosphorodiamidate backbone." + }, + { + "const": "lna_modified", + "title": "LNA-modified", + "description": "DNA/RNA with Locked Nucleic Acid residues incorporated." + }, + { + "const": "two_ome_rna", + "title": "2'-O-Me RNA", + "description": "2'-O-methyl RNA backbone." + } + ] + }, + "modifications": { + "title": "Chemical Modifications", + "description": "Sequence-level chemistry (not labels): e.g., nuclease protection or affinity spacers.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "phosphorothioate", + "two_ome_spiked", + "lna_spiked", + "mgb", + "inverted_dT_3prime", + "amine_5prime", + "thiol_5prime", + "biotin_teg", + "spacer_18", + "cholesterol" + ], + "description": "Common modification keyword." + } + } + } + }, + "labeling": { + "title": "Labeling", + "description": "Reporting/enrichment labels attached to the probe (fluors, quenchers, haptens, enzymes, isotopes).", + "type": "object", + "additionalProperties": false, + "properties": { + "strategy": { + "title": "Label Strategy", + "description": "High-level labeling approach; combine with concrete labels below as known.", + "type": "string", + "enum": [ + "none", + "fluor_only", + "fluor_quencher", + "hapten", + "enzymatic", + "radioisotope" + ] + }, + "reporters": { + "title": "Reporter Dyes", + "description": "Fluorophores or other reporters (free text to allow any brand/dye).", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "description": "Reporter name (e.g., FAM, HEX, Cy3, ATTO647N)." + } + }, + "quenchers": { + "title": "Quenchers", + "description": "Quenchers used in hydrolysis or hairpin probes.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "description": "Quencher name (e.g., BHQ1, BHQ2, Iowa Black FQ)." 
+ } + }, + "haptens": { + "title": "Haptens", + "description": "Affinity tags detected by antibodies/streptavidin.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "biotin", + "digoxigenin", + "dinitrophenol", + "fluorescein_hapten" + ], + "description": "Common hapten tag." + } + }, + "enzymes": { + "title": "Enzyme Labels", + "description": "Enzyme conjugates used for colorimetric/chemiluminescent detection.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "HRP", + "AP" + ], + "description": "Common conjugated enzyme." + } + }, + "isotopes": { + "title": "Radioisotopes", + "description": "If radio-labeled, indicate isotope(s).", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "description": "Isotope (e.g., 32P, 33P, 35S)." + } + } + } + }, + "targeting": { + "title": "Targeting", + "description": "What the probe is intended to hybridize to, and in what context.", + "type": "object", + "additionalProperties": false, + "properties": { + "biomolecule": { + "title": "Biomolecule", + "description": "High-level target class.", + "type": "string", + "enum": [ + "dna", + "rna", + "mrna", + "mirna", + "lncrna", + "rrna", + "genomic_dna", + "viral_rna", + "amplicon" + ] + }, + "context": { + "title": "Context", + "description": "Assay/biological context for the target.", + "type": "string", + "enum": [ + "genomic", + "transcript", + "amplicon", + "in_situ", + "capture" + ] + }, + "target_name": { + "title": "Target Name", + "description": "Gene/transcript/locus identifier (free text).", + "type": "string" + } + } + }, + "set_design": { + "title": "Set / Panel Design", + "description": "Whether the probe is a single oligo or part of a designed set/panel.", + "type": "object", + "additionalProperties": false, + "properties": { + "mode": { + "title": "Set Mode", + "description": "Single probe or specific multi-probe design.", + "type": "string", + "enum": [ + 
"single", + "tiling_set", + "capture_baits", + "smfish_panel", + "merfish_panel", + "padlock_set" + ] + }, + "count": { + "title": "Probe Count", + "description": "Number of probes in the set/panel (if known).", + "type": "integer", + "minimum": 1 + } + } + }, + "amplification_mechanism": { + "title": "Amplification Mechanism", + "description": "Signal amplification paradigm, if applicable (complements—does not replace—probe_type).", + "type": "string", + "enum": [ + "none", + "hydrolysis", + "fret", + "hairpin_turn_on", + "rolling_circle", + "branched_dna", + "hcr" + ] + }, + "application": { + "title": "Application", + "description": "Intended use(s) of the probe. Provide multiple if applicable.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "qpcr", + "ddpcr", + "pcr_probe", + "fish", + "ish", + "smfish", + "merfish", + "ngs_capture", + "microarray", + "southern", + "northern", + "dot_blot", + "in_cell_imaging" + ], + "description": "Common application keyword." + } + }, + "provenance": { + "title": "Provenance", + "description": "Source metadata for traceability.", + "type": "object", + "additionalProperties": false, + "properties": { + "doi": { + "title": "DOI", + "description": "Digital Object Identifier of the source article.", + "type": "string", + "format": "iri" + }, + "pmid": { + "title": "PMID", + "description": "PubMed identifier.", + "type": "string" + }, + "vendor": { + "title": "Vendor", + "description": "Commercial supplier (if from a catalog).", + "type": "string" + }, + "catalog_number": { + "title": "Catalog Number", + "description": "Supplier's catalog identifier.", + "type": "string" + } + } + }, + "notes": { + "title": "Notes", + "description": "Free-text comments or qualifiers that don't fit other fields.", + "type": "string", + "examples": [ + "Probe includes internal ZEN quencher." 
+ ] + } + } + }, + "hybridization_experiment_description": { + "type": "string", + "minLength": 20, + "maxLength": 200, + "description": "Explain, what was tested in this instance of hybridization experiment." + } + } + } +} +``` \ No newline at end of file diff --git a/extraction/passes/_3_ConstructSingleExperiment/prompt.txt b/extraction/passes/_3_ConstructSingleExperiment/prompt.txt new file mode 100644 index 0000000..fa5d677 --- /dev/null +++ b/extraction/passes/_3_ConstructSingleExperiment/prompt.txt @@ -0,0 +1,18 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions that you have to answer based only on the provided article text. + +STRICT RULES for how you work and respond: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following task for JSON extraction: +* Describe the experiment in which the given nucleotide sequence is present and provide your answer in a JSON format following the schema. 
diff --git a/extraction/schemas/single_experiment.json b/extraction/passes/_3_ConstructSingleExperiment/schema.json similarity index 100% rename from extraction/schemas/single_experiment.json rename to extraction/passes/_3_ConstructSingleExperiment/schema.json diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index fbadc4c..eb67144 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -45,17 +45,21 @@ # Config models # ────────────────────────────────────────────────────────────────────── + @dataclass class PassConfig: """Single extraction pass config loaded from pipeline.json.""" - name: str # e.g., "A_core" - schema_path: Path # path to JSON Schema file - prompt_path: Path # path to the prompt .txt file + + name: str # e.g., "A_core" + schema_path: Path # path to JSON Schema file + prompt_path: Path # path to the prompt .txt file timeout: Optional[int] + @dataclass class PipelineConfig: """Pipeline config loaded from config/pipeline.json.""" + model_names: List[str] ollama_parameters: Dict[str, Any] ollama_base_url: str @@ -63,7 +67,7 @@ class PipelineConfig: input_dir: Path out_dir: Path full_schema_path: Optional[Path] - single_experiment_schema_path: Optional[Path] + construct_single_experiment_pass: Optional[PassConfig] db_path: Optional[Path] article_glob: str pre_passes: List[PassConfig] @@ -73,6 +77,7 @@ class PipelineConfig: def model_name_encode(model_name: str) -> str: return model_name.replace("/", "_").replace("\\", "_").replace(":", "_") + def load_pipeline_config(project_dir: Path) -> PipelineConfig: """Load pipeline.json and construct a PipelineConfig. 
@@ -104,15 +109,13 @@ def _opt_path(p) -> Optional[Path]: return (project_dir / p) if p else None pre_passes: List[PassConfig] = [] - for p in data["pre_passes"]: - pre_passes.append( - PassConfig( - name=p["name"], - schema_path=project_dir / p["schema"], - prompt_path=project_dir / p["prompt"], - timeout=p.get("timeout", None) - ) - ) + p = data["construct_single_experiment_pass"] + construct_single_experiment_pass = PassConfig( + name=p["name"], + schema_path=project_dir / p["schema"], + prompt_path=project_dir / p["prompt"], + timeout=p.get("timeout", None), + ) passes: List[PassConfig] = [] for p in data["passes"]: @@ -121,7 +124,7 @@ def _opt_path(p) -> Optional[Path]: name=p["name"], schema_path=project_dir / p["schema"], prompt_path=project_dir / p["prompt"], - timeout=p.get("timeout", None) + timeout=p.get("timeout", None), ) ) @@ -133,7 +136,7 @@ def _opt_path(p) -> Optional[Path]: input_dir=project_dir / data.get("input_dir", "inputs"), out_dir=project_dir / data.get("out_dir", "out"), full_schema_path=_opt_path(data.get("full_schema_path")), - single_experiment_schema_path=_opt_path(data.get("single_experiment_schema_path")), + construct_single_experiment_pass=construct_single_experiment_pass, db_path=_opt_path(data.get("db_path")), article_glob=data.get("article_glob", "*.txt"), pre_passes=pre_passes, @@ -145,6 +148,7 @@ def _opt_path(p) -> Optional[Path]: # Logging # ────────────────────────────────────────────────────────────────────── + def _make_logger(log_dir: Path) -> logging.Logger: log_dir.mkdir(parents=True, exist_ok=True) logger = logging.getLogger("pipeline_filedriven") @@ -167,7 +171,10 @@ def _make_logger(log_dir: Path) -> logging.Logger: # Tools (Ollama helpers) — Google-style docstrings # ────────────────────────────────────────────────────────────────────── -def to_si(value: Optional[float], unit: Optional[str]) -> Tuple[Optional[float], Optional[str]]: + +def to_si( + value: Optional[float], unit: Optional[str] +) -> 
Tuple[Optional[float], Optional[str]]: """Convert a numeric value and unit to SI. Supports temperature and common concentrations. @@ -206,6 +213,7 @@ def to_si(value: Optional[float], unit: Optional[str]) -> Tuple[Optional[float], re.X, ) + def parse_oligo(raw: Optional[str]) -> Dict[str, Any]: """Parse a decorated oligo string into structured parts. @@ -251,7 +259,9 @@ def parse_oligo(raw: Optional[str]) -> Dict[str, Any]: return result -def make_measurement(raw: Optional[str], value: Optional[float] = None, unit: Optional[str] = None) -> Dict[str, Any]: +def make_measurement( + raw: Optional[str], value: Optional[float] = None, unit: Optional[str] = None +) -> Dict[str, Any]: """Build a 'measurement' object with SI conversion. Args: @@ -262,7 +272,9 @@ def make_measurement(raw: Optional[str], value: Optional[float] = None, unit: Op Returns: A dict with keys: raw, value, unit, si_value, si_unit, assumptions (None). """ - si_value, si_unit = to_si(value, unit) if (value is not None and unit is not None) else (None, None) + si_value, si_unit = ( + to_si(value, unit) if (value is not None and unit is not None) else (None, None) + ) return { "raw": raw or "", "value": value, @@ -277,6 +289,7 @@ def make_measurement(raw: Optional[str], value: Optional[float] = None, unit: Op # JSON helpers # ────────────────────────────────────────────────────────────────────── + def repair_json(text: str) -> str: """Best-effort JSON repair for streamed outputs.""" start = text.find("{") @@ -297,6 +310,7 @@ def repair_json(text: str) -> str: # Outlines runner # ────────────────────────────────────────────────────────────────────── + def _now_stamp() -> str: return datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f") @@ -324,9 +338,18 @@ def run_single_pass( prompt = pass_cfg.prompt_path.read_text(encoding="utf-8") stamp = _now_stamp() - raw_txt_path = txt_dir / f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.txt" - json_out_path = json_dir / 
f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.json" - err_log_path = log_dir / f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.log" + raw_txt_path = ( + txt_dir + / f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.txt" + ) + json_out_path = ( + json_dir + / f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.json" + ) + err_log_path = ( + log_dir + / f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.log" + ) logger.info(f"[{pass_cfg.name}:{model_name}] generating …") response = "" @@ -339,10 +362,16 @@ def run_single_pass( # ): # response += chunk response = model.generate( - prompt + "\n" + "And here is the article text you must base your answer on:\n\n
\n" + article_text + "\n<\\article>\n", + prompt + + "\n" + + "And here is the article text you must base your answer on:\n\n
\n" + + article_text + + "\n<\\article>\n", output_type=js, options=ollama_parameters, - #tools=tools, # TODO: Temporarily switch tools off + # tools=tools, # TODO: Temporarily switch tools off + think=True, + keep_alive="30s", ) except Exception as e: logger.exception(f"[{pass_cfg.name}:{model_name}] stream error") @@ -356,19 +385,130 @@ def run_single_pass( obj = json.loads(fixed) except Exception as e: logger.exception(f"[{pass_cfg.name}:{model_name}] JSON parse error") - err_log_path.write_text(f"JSON ERROR:\n{e}\nRAW:\n{response}\n", encoding="utf-8") + err_log_path.write_text( + f"JSON ERROR:\n{e}\nRAW:\n{response}\n", encoding="utf-8" + ) raise errors = sorted(validator.iter_errors(obj), key=lambda er: er.path) if errors: msg = "\n".join(str(e) for e in errors) logger.error(f"[{pass_cfg.name}:{model_name}] validation errors:\n{msg}") - err_log_path.write_text(f"VALIDATION ERRORS:\n{msg}\nJSON:\n{json.dumps(obj, indent=2)}", encoding="utf-8") + err_log_path.write_text( + f"VALIDATION ERRORS:\n{msg}\nJSON:\n{json.dumps(obj, indent=2)}", + encoding="utf-8", + ) else: logger.info(f"[{pass_cfg.name}:{model_name}] validation OK") logger.info(f"[{pass_cfg.name}] validation OK [{model_name}]") - json_out_path.write_text(json.dumps(obj, indent=2, ensure_ascii=False), encoding="utf-8") + json_out_path.write_text( + json.dumps(obj, indent=2, ensure_ascii=False), encoding="utf-8" + ) + return obj + + +def run_construct_single_experiment_pass( + model: Any, + article_text: str, + sequence: str, + pass_cfg: PassConfig, + out_base: Path, + article_stem: str, + tools: List[Any], + logger: logging.Logger, + ollama_parameters: Dict[str, Any], + model_name: str, +) -> Dict[str, Any]: + """Run one pass (schema+prompt from files), save raw+json+log, return object.""" + txt_dir = out_base / "txt" + json_dir = out_base / "json" + log_dir = out_base / "logs" + for d in (txt_dir, json_dir, log_dir): + d.mkdir(parents=True, exist_ok=True) + + js = 
JsonSchema(pass_cfg.schema_path.read_text(encoding="utf-8")) + validator = Draft202012Validator(json.loads(js.schema)) + prompt = pass_cfg.prompt_path.read_text(encoding="utf-8") + + stamp = _now_stamp() + raw_txt_path = ( + txt_dir + / f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.txt" + ) + json_out_path = ( + json_dir + / f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.json" + ) + err_log_path = ( + log_dir + / f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.log" + ) + + logger.info(f"[{pass_cfg.name}:{model_name}] generating …") + response = "" + try: + # for chunk in model.stream( + # prompt + "\n\n" + article_text, + # output_type=js, + # options=ollama_parameters, + # tools=tools, + # ): + # response += chunk + response = model.chat( + messages=[ + { + "role": "system", "content": prompt + + "\n" + + "And here is the article text you must base your answer on:\n\n
\n" + + article_text + + "\n<\\article>\n" + }, + { + "role": "user", + "content": "Let's describe a single nucleotide sequence!", + }, + {'role': 'assistant', 'content': "Sure! Let's describe one! But before we start, could you please tell me in which format you would like me to provide you an answer?"}, + {'role': 'user', 'content': "Great question! I would like your answer to satisfy the following JSON schema:\n```json" + json.dumps(js) + "\n```\n\nIs it OK?"}, + {'role': 'assistant', 'content': "Absolutely! Now please provide the nucleotide sequence you want me to describe in terms of tthe hybridization experiment design and I will provide you its description strictly following your provided JSON schema!"}, + ], + output_type=js, + options=ollama_parameters, + think=True, + keep_alive="30s", + ) + except Exception as e: + logger.exception(f"[{pass_cfg.name}:{model_name}] stream error") + err_log_path.write_text(f"STREAM ERROR:\n{e}\n", encoding="utf-8") + raise + + raw_txt_path.write_text(response, encoding="utf-8") + + try: + fixed = repair_json(response) + obj = json.loads(fixed) + except Exception as e: + logger.exception(f"[{pass_cfg.name}:{model_name}] JSON parse error") + err_log_path.write_text( + f"JSON ERROR:\n{e}\nRAW:\n{response}\n", encoding="utf-8" + ) + raise + + errors = sorted(validator.iter_errors(obj), key=lambda er: er.path) + if errors: + msg = "\n".join(str(e) for e in errors) + logger.error(f"[{pass_cfg.name}:{model_name}] validation errors:\n{msg}") + err_log_path.write_text( + f"VALIDATION ERRORS:\n{msg}\nJSON:\n{json.dumps(obj, indent=2)}", + encoding="utf-8", + ) + else: + logger.info(f"[{pass_cfg.name}:{model_name}] validation OK") + logger.info(f"[{pass_cfg.name}] validation OK [{model_name}]") + + json_out_path.write_text( + json.dumps(obj, indent=2, ensure_ascii=False), encoding="utf-8" + ) return obj @@ -376,6 +516,7 @@ def run_single_pass( # Stitcher (to your full object) # 
────────────────────────────────────────────────────────────────────── + def _merge_reports(*reports: Optional[Dict[str, Any]]) -> Dict[str, Any]: out = {"missing": [], "uncertain": [], "notes": None} notes = [] @@ -392,7 +533,9 @@ def _merge_reports(*reports: Optional[Dict[str, Any]]) -> Dict[str, Any]: return out -def _to_si(value: Optional[float], unit: Optional[str]) -> Tuple[Optional[float], Optional[str]]: +def _to_si( + value: Optional[float], unit: Optional[str] +) -> Tuple[Optional[float], Optional[str]]: return to_si(value, unit) @@ -402,7 +545,11 @@ def _to_measurement_full(m_lite: Optional[Dict[str, Any]]) -> Optional[Dict[str, raw = m_lite.get("raw") or "" value = m_lite.get("value") unit = m_lite.get("unit") - si_value, si_unit = _to_si(value, unit) if (value is not None and unit is not None) else (None, None) + si_value, si_unit = ( + _to_si(value, unit) + if (value is not None and unit is not None) + else (None, None) + ) return { "raw": raw, "value": value, @@ -432,7 +579,14 @@ def _detect_sa_from_name(name: Optional[str]) -> Optional[str]: def _coerce_sa(value: Optional[str], name: Optional[str]) -> Optional[str]: - m = {"s": "sense", "as": "antisense", "sense": "sense", "antisense": "antisense", "+": "sense", "-": "antisense"} + m = { + "s": "sense", + "as": "antisense", + "sense": "sense", + "antisense": "antisense", + "+": "sense", + "-": "antisense", + } if value is None or (isinstance(value, str) and not value.strip()): return _detect_sa_from_name(name) return m.get(str(value).strip().lower(), _detect_sa_from_name(name)) @@ -468,9 +622,13 @@ def stitch_full( E_outcomes: Dict[str, Any], F_pairings: Dict[str, Any], ) -> Dict[str, Any]: - core = {"doi": A_core.get("doi"), "abstract": A_core.get("abstract"), "topic": A_core.get("topic")} + core = { + "doi": A_core.get("doi"), + "abstract": A_core.get("abstract"), + "topic": A_core.get("topic"), + } E: Dict[str, Dict[str, Any]] = {} - for e in (B_index.get("experiments") or []): + for e in 
B_index.get("experiments") or []: E[e["id_exp"]] = { "id_exp": e["id_exp"], "raw_description": e.get("raw_description"), @@ -484,7 +642,7 @@ def stitch_full( "extraction_report": {"missing": [], "uncertain": [], "notes": None}, } - for item in (C_sequences.get("items") or []): + for item in C_sequences.get("items") or []: ie = item["id_exp"] if ie not in E: continue @@ -503,16 +661,24 @@ def stitch_full( seqs["target_sequence"] = _to_oligo_full(tgt) if tgt is not None else None pr = item.get("primer_sequences") if isinstance(pr, dict): - seqs["primer_sequences"] = {"forward": _to_oligo_full(pr.get("forward")), "reverse": _to_oligo_full(pr.get("reverse"))} + seqs["primer_sequences"] = { + "forward": _to_oligo_full(pr.get("forward")), + "reverse": _to_oligo_full(pr.get("reverse")), + } else: seqs["primer_sequences"] = None rels = [] - for rs in (item.get("related_sequences") or []): - rels.append({"related_sequence": _to_oligo_full(rs.get("related_sequence")), "description": rs.get("description")}) + for rs in item.get("related_sequences") or []: + rels.append( + { + "related_sequence": _to_oligo_full(rs.get("related_sequence")), + "description": rs.get("description"), + } + ) seqs["related_sequences"] = rels E[ie]["sequences"] = seqs - for item in (D_parameters.get("items") or []): + for item in D_parameters.get("items") or []: ie = item["id_exp"] if ie not in E: continue @@ -545,7 +711,9 @@ def stitch_full( EP: Dict[str, Any] = {} concs = (item.get("experiment_properties") or {}).get("concentrations") or {} EP["concentrations"] = { - "dna_rna_concentration": _to_measurement_full(concs.get("dna_rna_concentration")), + "dna_rna_concentration": _to_measurement_full( + concs.get("dna_rna_concentration") + ), "concentration_SI": _to_measurement_full(concs.get("concentration_SI")), } pars = (item.get("experiment_properties") or {}).get("parameters_SI") or {} @@ -560,7 +728,7 @@ def stitch_full( E[ie]["metadata"] = MD E[ie]["experiment_properties"] = EP - for item in 
(E_outcomes.get("items") or []): + for item in E_outcomes.get("items") or []: ie = item["id_exp"] if ie not in E: continue @@ -570,7 +738,7 @@ def stitch_full( "comparative_notes": item.get("comparative_notes"), } - for item in (F_pairings.get("items") or []): + for item in F_pairings.get("items") or []: ie = item["id_exp"] if ie not in E: continue @@ -596,6 +764,7 @@ def stitch_full( "extraction_report": full_report, } + def _deep_merge_keep_left(a, b): """Shallow-friendly deep merge: keep a's non-null scalars; use b if a is None. - Dicts: recurse. @@ -621,7 +790,10 @@ def _deep_merge_keep_left(a, b): def aggregate_c_outputs(outputs: Dict[str, Dict[str, Any]]) -> Dict[str, Any]: """Build a consolidated C_sequences object from any of: C_sequences, C1_probe_core, C2_target_primers, C3_related.""" # Start with single-pass C if present - base = outputs.get("C_sequences") or {"items": [], "extraction_report": {"missing": [], "uncertain": [], "notes": None}} + base = outputs.get("C_sequences") or { + "items": [], + "extraction_report": {"missing": [], "uncertain": [], "notes": None}, + } # Build item index by id_exp from base items_map: Dict[str, Dict[str, Any]] = {} @@ -644,10 +816,20 @@ def _merge_from(pass_name: str, fields: List[str]): tgt[f] = _deep_merge_keep_left(tgt.get(f), it[f]) # merge extraction report - br = base.get("extraction_report") or {"missing": [], "uncertain": [], "notes": None} - er = obj.get("extraction_report") or {"missing": [], "uncertain": [], "notes": None} + br = base.get("extraction_report") or { + "missing": [], + "uncertain": [], + "notes": None, + } + er = obj.get("extraction_report") or { + "missing": [], + "uncertain": [], + "notes": None, + } br["missing"] = list((br.get("missing") or []) + (er.get("missing") or [])) - br["uncertain"] = list((br.get("uncertain") or []) + (er.get("uncertain") or [])) + br["uncertain"] = list( + (br.get("uncertain") or []) + (er.get("uncertain") or []) + ) br_notes = [n for n in [br.get("notes"), 
er.get("notes")] if n] br["notes"] = " | ".join(br_notes) if br_notes else None base["extraction_report"] = br @@ -666,13 +848,18 @@ def _merge_from(pass_name: str, fields: List[str]): it.setdefault("primer_sequences", None) it.setdefault("related_sequences", []) - return {"items": merged_items, "extraction_report": base.get("extraction_report") or {"missing": [], "uncertain": [], "notes": None}} + return { + "items": merged_items, + "extraction_report": base.get("extraction_report") + or {"missing": [], "uncertain": [], "notes": None}, + } # ────────────────────────────────────────────────────────────────────── # Project runner # ────────────────────────────────────────────────────────────────────── + def run_project(project_dir: str | Path) -> None: """Run the pipeline as configured by files under project_dir.""" project_dir = Path(project_dir) @@ -684,10 +871,12 @@ def run_project(project_dir: str | Path) -> None: headers = dict() if API_TOKEN is not None: - headers['Authorization'] = f'Bearer {API_TOKEN}' - + headers["Authorization"] = f"Bearer {API_TOKEN}" + # Ollama client + Outlines model - client = ollama.Client(host=cfg.ollama_base_url, timeout=cfg.timeout_s, headers=headers) + client = ollama.Client( + host=cfg.ollama_base_url, timeout=cfg.timeout_s, headers=headers + ) for model_name in cfg.model_names: model = outlines.from_ollama(client, model_name) @@ -701,7 +890,9 @@ def run_project(project_dir: str | Path) -> None: full_validator = Draft202012Validator(json.loads(full_schema_text)) logger.info("Loaded full schema for final validation.") except Exception: - logger.exception("Failed to load/parse full schema; proceeding without final validation.") + logger.exception( + "Failed to load/parse full schema; proceeding without final validation." 
+ ) logger.info(f"Article glob: {cfg.article_glob}") @@ -716,7 +907,9 @@ def run_project(project_dir: str | Path) -> None: # Run configured pre-passes outputs: Dict[str, Dict[str, Any]] = {} - for p in tqdm(cfg.pre_passes, desc=f"{article_name} pre-passes", leave=False): + for p in tqdm( + cfg.pre_passes, desc=f"{article_name} pre-passes", leave=False + ): try: outputs[p.name] = run_single_pass( model=model, @@ -730,11 +923,21 @@ def run_project(project_dir: str | Path) -> None: model_name=model_name, ) except Exception: - logger.exception(f"Pass failed: {p.name} : {article_name} : {model_name}") + logger.exception( + f"Pass failed: {p.name} : {article_name} : {model_name}" + ) all_found_sequences = ", ".join(outputs["SeqPrompt_strict"]) logger.info("Pre-passes done, found sequences: " + all_found_sequences) + for seq in tqdm( + set(outputs["SeqPrompt_strict"]).union(outputs["SeqPrompt"]), + desc=f"{article_name}: sequences construction", + leave=False, + ): + + pass + # for p in tqdm(cfg.passes, desc=f"{article_name} passes", leave=False): # try: # outputs[p.name] = run_single_pass( From f69cd91c6ce1133f4ed9beeecb01ca81cf7518c6 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Mon, 6 Oct 2025 05:20:24 +0400 Subject: [PATCH 036/102] Adding brutal check for thinking --- extraction/config/pipeline.json | 2 +- extraction/pipeline_pre_quest.py | 118 ++++++++++++++++++++++--------- 2 files changed, 85 insertions(+), 35 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index b519f5d..1a7d6f3 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,7 +1,7 @@ { "model_names": [ - "deepseek-r1:7b-qwen-distill-q4_K_M", "myaniu/qwen2.5-1m:7b", + "deepseek-r1:7b-qwen-distill-q4_K_M", "qwen2.5-coder:3b", "llama3.1:latest", "qwen3:4b", diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index eb67144..a6fccd3 100755 --- a/extraction/pipeline_pre_quest.py +++ 
b/extraction/pipeline_pre_quest.py @@ -109,6 +109,17 @@ def _opt_path(p) -> Optional[Path]: return (project_dir / p) if p else None pre_passes: List[PassConfig] = [] + for p in data["pre_passes"]: + pre_passes.append( + PassConfig( + name=p["name"], + schema_path=project_dir / p["schema"], + prompt_path=project_dir / p["prompt"], + timeout=p.get("timeout", None), + ) + ) + + p = data["construct_single_experiment_pass"] construct_single_experiment_pass = PassConfig( name=p["name"], @@ -361,18 +372,32 @@ def run_single_pass( # tools=tools, # ): # response += chunk - response = model.generate( - prompt - + "\n" - + "And here is the article text you must base your answer on:\n\n
\n" - + article_text - + "\n<\\article>\n", - output_type=js, - options=ollama_parameters, - # tools=tools, # TODO: Temporarily switch tools off - think=True, - keep_alive="30s", - ) + try: + response = model.generate( + prompt + + "\n" + + "And here is the article text you must base your answer on:\n\n
\n" + + article_text + + "\n<\\article>\n", + output_type=js, + options=ollama_parameters, + # tools=tools, # TODO: Temporarily switch tools off + think=True, + keep_alive="30s", + ) + except ollama.ResponseError: + response = model.generate( + prompt + + "\n" + + "And here is the article text you must base your answer on:\n\n
\n" + + article_text + + "\n<\\article>\n", + output_type=js, + options=ollama_parameters, + # tools=tools, # TODO: Temporarily switch tools off + think=False, + keep_alive="30s", + ) except Exception as e: logger.exception(f"[{pass_cfg.name}:{model_name}] stream error") err_log_path.write_text(f"STREAM ERROR:\n{e}\n", encoding="utf-8") @@ -455,28 +480,52 @@ def run_construct_single_experiment_pass( # tools=tools, # ): # response += chunk - response = model.chat( - messages=[ - { - "role": "system", "content": prompt - + "\n" - + "And here is the article text you must base your answer on:\n\n
\n" - + article_text - + "\n<\\article>\n" - }, - { - "role": "user", - "content": "Let's describe a single nucleotide sequence!", - }, - {'role': 'assistant', 'content': "Sure! Let's describe one! But before we start, could you please tell me in which format you would like me to provide you an answer?"}, - {'role': 'user', 'content': "Great question! I would like your answer to satisfy the following JSON schema:\n```json" + json.dumps(js) + "\n```\n\nIs it OK?"}, - {'role': 'assistant', 'content': "Absolutely! Now please provide the nucleotide sequence you want me to describe in terms of tthe hybridization experiment design and I will provide you its description strictly following your provided JSON schema!"}, - ], - output_type=js, - options=ollama_parameters, - think=True, - keep_alive="30s", - ) + try: + response = model.chat( + messages=[ + { + "role": "system", "content": prompt + + "\n" + + "And here is the article text you must base your answer on:\n\n
\n" + + article_text + + "\n<\\article>\n" + }, + { + "role": "user", + "content": "Let's describe a single nucleotide sequence!", + }, + {'role': 'assistant', 'content': "Sure! Let's describe one! But before we start, could you please tell me in which format you would like me to provide you an answer?"}, + {'role': 'user', 'content': "Great question! I would like your answer to satisfy the following JSON schema:\n```json" + json.dumps(js) + "\n```\n\nIs it OK?"}, + {'role': 'assistant', 'content': "Absolutely! Now please provide the nucleotide sequence you want me to describe in terms of tthe hybridization experiment design and I will provide you its description strictly following your provided JSON schema!"}, + ], + output_type=js, + options=ollama_parameters, + think=True, + keep_alive="30s", + ) + except ollama.ResponseError: + response = model.chat( + messages=[ + { + "role": "system", "content": prompt + + "\n" + + "And here is the article text you must base your answer on:\n\n
\n" + + article_text + + "\n<\\article>\n" + }, + { + "role": "user", + "content": "Let's describe a single nucleotide sequence!", + }, + {'role': 'assistant', 'content': "Sure! Let's describe one! But before we start, could you please tell me in which format you would like me to provide you an answer?"}, + {'role': 'user', 'content': "Great question! I would like your answer to satisfy the following JSON schema:\n```json" + json.dumps(js) + "\n```\n\nIs it OK?"}, + {'role': 'assistant', 'content': "Absolutely! Now please provide the nucleotide sequence you want me to describe in terms of tthe hybridization experiment design and I will provide you its description strictly following your provided JSON schema!"}, + ], + output_type=js, + options=ollama_parameters, + think=False, + keep_alive="30s", + ) except Exception as e: logger.exception(f"[{pass_cfg.name}:{model_name}] stream error") err_log_path.write_text(f"STREAM ERROR:\n{e}\n", encoding="utf-8") @@ -878,6 +927,7 @@ def run_project(project_dir: str | Path) -> None: host=cfg.ollama_base_url, timeout=cfg.timeout_s, headers=headers ) + ollama_models = client.list() for model_name in cfg.model_names: model = outlines.from_ollama(client, model_name) tools = [to_si, parse_oligo, make_measurement] From c6da7e15867586d7765cd45bbac3d6b2780aa986 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Mon, 6 Oct 2025 05:23:10 +0400 Subject: [PATCH 037/102] Fixed chatting function and added call to it --- extraction/pipeline_pre_quest.py | 85 ++++++++++++++++++++++++-------- 1 file changed, 65 insertions(+), 20 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index a6fccd3..0e7f2b5 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -119,7 +119,6 @@ def _opt_path(p) -> Optional[Path]: ) ) - p = data["construct_single_experiment_pass"] construct_single_experiment_pass = PassConfig( name=p["name"], @@ -437,6 +436,7 @@ def 
run_construct_single_experiment_pass( model: Any, article_text: str, sequence: str, + sequence_id: int, pass_cfg: PassConfig, out_base: Path, article_stem: str, @@ -459,15 +459,15 @@ def run_construct_single_experiment_pass( stamp = _now_stamp() raw_txt_path = ( txt_dir - / f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.txt" + / f"{article_stem}__{pass_cfg.name}__{sequence_id}__{model_name_encode(model_name)}__{stamp}.txt" ) json_out_path = ( json_dir - / f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.json" + / f"{article_stem}__{pass_cfg.name}__{sequence_id}__{model_name_encode(model_name)}__{stamp}.json" ) err_log_path = ( log_dir - / f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.log" + / f"{article_stem}__{pass_cfg.name}__{sequence_id}__{model_name_encode(model_name)}__{stamp}.log" ) logger.info(f"[{pass_cfg.name}:{model_name}] generating …") @@ -484,19 +484,35 @@ def run_construct_single_experiment_pass( response = model.chat( messages=[ { - "role": "system", "content": prompt + "role": "system", + "content": prompt + "\n" + "And here is the article text you must base your answer on:\n\n
\n" + article_text - + "\n<\\article>\n" + + "\n<\\article>\n", }, { "role": "user", "content": "Let's describe a single nucleotide sequence!", }, - {'role': 'assistant', 'content': "Sure! Let's describe one! But before we start, could you please tell me in which format you would like me to provide you an answer?"}, - {'role': 'user', 'content': "Great question! I would like your answer to satisfy the following JSON schema:\n```json" + json.dumps(js) + "\n```\n\nIs it OK?"}, - {'role': 'assistant', 'content': "Absolutely! Now please provide the nucleotide sequence you want me to describe in terms of tthe hybridization experiment design and I will provide you its description strictly following your provided JSON schema!"}, + { + "role": "assistant", + "content": "Sure! Let's describe one! But before we start, could you please tell me in which format you would like me to provide you an answer?", + }, + { + "role": "user", + "content": "Great question! I would like your answer to satisfy the following JSON schema:\n```json" + + json.dumps(js) + + "\n```\n\nIs it OK?", + }, + { + "role": "assistant", + "content": "Absolutely! Now please provide the nucleotide sequence you want me to describe in terms of tthe hybridization experiment design and I will provide you its description strictly following your provided JSON schema!", + }, + { + 'role': 'user', + 'content': sequence, + } ], output_type=js, options=ollama_parameters, @@ -507,19 +523,35 @@ def run_construct_single_experiment_pass( response = model.chat( messages=[ { - "role": "system", "content": prompt + "role": "system", + "content": prompt + "\n" + "And here is the article text you must base your answer on:\n\n
\n" + article_text - + "\n<\\article>\n" + + "\n<\\article>\n", }, { "role": "user", "content": "Let's describe a single nucleotide sequence!", }, - {'role': 'assistant', 'content': "Sure! Let's describe one! But before we start, could you please tell me in which format you would like me to provide you an answer?"}, - {'role': 'user', 'content': "Great question! I would like your answer to satisfy the following JSON schema:\n```json" + json.dumps(js) + "\n```\n\nIs it OK?"}, - {'role': 'assistant', 'content': "Absolutely! Now please provide the nucleotide sequence you want me to describe in terms of tthe hybridization experiment design and I will provide you its description strictly following your provided JSON schema!"}, + { + "role": "assistant", + "content": "Sure! Let's describe one! But before we start, could you please tell me in which format you would like me to provide you an answer?", + }, + { + "role": "user", + "content": "Great question! I would like your answer to satisfy the following JSON schema:\n```json" + + json.dumps(js) + + "\n```\n\nIs it OK?", + }, + { + "role": "assistant", + "content": "Absolutely! 
Now please provide the nucleotide sequence you want me to describe in terms of the hybridization experiment design and I will provide you its description strictly following your provided JSON schema!", + }, + { + 'role': 'user', + 'content': sequence, + } ], output_type=js, options=ollama_parameters, @@ -980,13 +1012,26 @@ def run_project(project_dir: str | Path) -> None: all_found_sequences = ", ".join(outputs["SeqPrompt_strict"]) logger.info("Pre-passes done, found sequences: " + all_found_sequences) - for seq in tqdm( - set(outputs["SeqPrompt_strict"]).union(outputs["SeqPrompt"]), - desc=f"{article_name}: sequences construction", - leave=False, + for i, seq in enumerate( + tqdm( + set(outputs["SeqPrompt_strict"]).union(outputs["SeqPrompt"]), + desc=f"{article_name}: sequences construction", + leave=False, + ) ): - - pass + run_construct_single_experiment_pass( + model=model, + article_text=article_text, + sequence=seq, + sequence_id=i, + pass_cfg=cfg.construct_single_experiment_pass, + out_base=out_base, + article_stem=article_name, + tools=tools, + logger=logger, + ollama_parameters=cfg.ollama_parameters, + model_name=model_name, + ) # for p in tqdm(cfg.passes, desc=f"{article_name} passes", leave=False): # try: From dbfc1d974b27e6c2da6daa3595050dc5de0a4cc7 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Mon, 6 Oct 2025 05:29:26 +0400 Subject: [PATCH 038/102] Update parameters to change model and step sets --- extraction/config/pipeline.json | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 1a7d6f3..f6f892a 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,21 +1,22 @@ { "model_names": [ "myaniu/qwen2.5-1m:7b", - "deepseek-r1:7b-qwen-distill-q4_K_M", + "deepseek-r1:1.5b", "qwen2.5-coder:3b", + "deepseek-r1:7b-qwen-distill-q4_K_M", "llama3.1:latest", "qwen3:4b", "myaniu/qwen2.5-1m:14b" ], "ollama_parameters": { - 
"num_ctx": 65536, "num_predict": 131072, - "temperature": 0.35, + "temperature": 0.15, "seed": 42 }, "ollama_base_url": "http://127.0.0.1:11434", "timeout_s": 300, - "input_dir": "input/md", + "input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", "out_dir": "outlines_output_pre", "full_schema_path": "schema/json/article.json", "db_path": "outlines_output/massive.sqlite", @@ -32,25 +33,13 @@ "schema": "passes/_1_SeqPrompt/schema_strict.json", "prompt": "passes/_1_SeqPrompt/prompt_strict.txt", "timeout": 60 - }, - { - "name": "Experiments", - "schema": "passes/_2_Experiments/schema.json", - "prompt": "passes/_2_Experiments/prompt.txt", - "timeout": 60 - }, - { - "name": "Experiments-strict", - "schema": "passes/_2_Experiments/schema_strict.json", - "prompt": "passes/_2_Experiments/prompt_strict.txt", - "timeout": 60 } ], "construct_single_experiment_pass": { "name": "ConstructSingleExperiment", "schema": "passes/_3_ConstructSingleExperiment/schema.json", "prompt": "passes/_3_ConstructSingleExperiment/prompt.txt", - "timeout": 60 + "timeout": 600 }, "passes": [ { From 0595c8b3d031a2d26529c25ac0476177c582cce5 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Mon, 6 Oct 2025 05:47:59 +0400 Subject: [PATCH 039/102] Running on small set of articles --- extraction/config/pipeline.json | 8 +- extraction/pipeline_pre_quest.py | 149 ++++++++++++++++--------------- 2 files changed, 83 insertions(+), 74 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index f6f892a..12010d2 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -9,13 +9,13 @@ "myaniu/qwen2.5-1m:14b" ], "ollama_parameters": { - "num_ctx": 65536, + "num_ctx": 131072, "num_predict": 131072, - "temperature": 0.15, - "seed": 42 + "temperature": 0.2, + "seed": 52 }, "ollama_base_url": "http://127.0.0.1:11434", - "timeout_s": 300, + "timeout_s": 120, "input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", "out_dir": 
"outlines_output_pre", "full_schema_path": "schema/json/article.json", diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 0e7f2b5..c5d2ab2 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -481,78 +481,82 @@ def run_construct_single_experiment_pass( # ): # response += chunk try: - response = model.chat( - messages=[ - { - "role": "system", - "content": prompt - + "\n" - + "And here is the article text you must base your answer on:\n\n
\n" - + article_text - + "\n<\\article>\n", - }, - { - "role": "user", - "content": "Let's describe a single nucleotide sequence!", - }, - { - "role": "assistant", - "content": "Sure! Let's describe one! But before we start, could you please tell me in which format you would like me to provide you an answer?", - }, - { - "role": "user", - "content": "Great question! I would like your answer to satisfy the following JSON schema:\n```json" - + json.dumps(js) - + "\n```\n\nIs it OK?", - }, - { - "role": "assistant", - "content": "Absolutely! Now please provide the nucleotide sequence you want me to describe in terms of tthe hybridization experiment design and I will provide you its description strictly following your provided JSON schema!", - }, - { - 'role': 'user', - 'content': sequence, - } - ], + response = model.generate( + model_input=outlines.inputs.Chat( + [ + { + "role": "system", + "content": prompt + + "\n" + + "And here is the article text you must base your answer on:\n\n
\n" + + article_text + + "\n<\\article>\n", + }, + { + "role": "user", + "content": "Let's describe a single nucleotide sequence!", + }, + { + "role": "assistant", + "content": "Sure! Let's describe one! But before we start, could you please tell me in which format you would like me to provide you an answer?", + }, + { + "role": "user", + "content": "Great question! I would like your answer to satisfy the following JSON schema:\n```json" + + js.schema + + "\n```\n\nIs it OK?", + }, + { + "role": "assistant", + "content": "Absolutely! Now please provide the nucleotide sequence you want me to describe in terms of tthe hybridization experiment design and I will provide you its description strictly following your provided JSON schema!", + }, + { + "role": "user", + "content": sequence, + }, + ] + ), output_type=js, options=ollama_parameters, think=True, keep_alive="30s", ) except ollama.ResponseError: - response = model.chat( - messages=[ - { - "role": "system", - "content": prompt - + "\n" - + "And here is the article text you must base your answer on:\n\n
\n" - + article_text - + "\n<\\article>\n", - }, - { - "role": "user", - "content": "Let's describe a single nucleotide sequence!", - }, - { - "role": "assistant", - "content": "Sure! Let's describe one! But before we start, could you please tell me in which format you would like me to provide you an answer?", - }, - { - "role": "user", - "content": "Great question! I would like your answer to satisfy the following JSON schema:\n```json" - + json.dumps(js) - + "\n```\n\nIs it OK?", - }, - { - "role": "assistant", - "content": "Absolutely! Now please provide the nucleotide sequence you want me to describe in terms of tthe hybridization experiment design and I will provide you its description strictly following your provided JSON schema!", - }, - { - 'role': 'user', - 'content': sequence, - } - ], + response = model.generate( + model_input=outlines.inputs.Chat( + [ + { + "role": "system", + "content": prompt + + "\n" + + "And here is the article text you must base your answer on:\n\n
\n" + + article_text + + "\n<\\article>\n", + }, + { + "role": "user", + "content": "Let's describe a single nucleotide sequence!", + }, + { + "role": "assistant", + "content": "Sure! Let's describe one! But before we start, could you please tell me in which format you would like me to provide you an answer?", + }, + { + "role": "user", + "content": "Great question! I would like your answer to satisfy the following JSON schema:\n```json" + + js.schema + + "\n```\n\nIs it OK?", + }, + { + "role": "assistant", + "content": "Absolutely! Now please provide the nucleotide sequence you want me to describe in terms of tthe hybridization experiment design and I will provide you its description strictly following your provided JSON schema!", + }, + { + "role": "user", + "content": sequence, + }, + ] + ), output_type=js, options=ollama_parameters, think=False, @@ -1009,12 +1013,17 @@ def run_project(project_dir: str | Path) -> None: f"Pass failed: {p.name} : {article_name} : {model_name}" ) - all_found_sequences = ", ".join(outputs["SeqPrompt_strict"]) - logger.info("Pre-passes done, found sequences: " + all_found_sequences) + all_found_sequences = list( + sorted( + set(set(outputs["SeqPrompt_strict"]).union(outputs["SeqPrompt"])) + ) + ) + all_found_sequences_str = ", ".join(all_found_sequences) + logger.info("Pre-passes done, found sequences: " + all_found_sequences_str) for i, seq in enumerate( tqdm( - set(outputs["SeqPrompt_strict"]).union(outputs["SeqPrompt"]), + all_found_sequences, desc=f"{article_name}: sequences construction", leave=False, ) @@ -1023,7 +1032,7 @@ def run_project(project_dir: str | Path) -> None: model=model, article_text=article_text, sequence=seq, - seq_id=i, + sequence_id=i, pass_cfg=p, out_base=out_base, article_stem=article_name, From fdd554fc5ec31210b3f61ae275d57ab4b3b3bd3e Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Mon, 6 Oct 2025 05:50:16 +0400 Subject: [PATCH 040/102] Fixed script --- extraction/pipeline_pre_quest.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index c5d2ab2..538ec05 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1033,7 +1033,7 @@ def run_project(project_dir: str | Path) -> None: article_text=article_text, sequence=seq, sequence_id=i, - pass_cfg=p, + pass_cfg=cfg.construct_single_experiment_pass, out_base=out_base, article_stem=article_name, tools=tools, From 87fa67c2b4c5329c4ba27add81e14310475e28ac Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Mon, 6 Oct 2025 05:55:58 +0400 Subject: [PATCH 041/102] Seems that per-sequence schema is broken --- .../passes/_3_ConstructSingleExperiment/schema.json | 9 --------- 1 file changed, 9 deletions(-) diff --git a/extraction/passes/_3_ConstructSingleExperiment/schema.json b/extraction/passes/_3_ConstructSingleExperiment/schema.json index 20e24b6..37961c7 100644 --- a/extraction/passes/_3_ConstructSingleExperiment/schema.json +++ b/extraction/passes/_3_ConstructSingleExperiment/schema.json @@ -344,8 +344,6 @@ "additionalProperties": false, "required": [ "doi", - "abstract", - "topic", "experiments", "extraction_report" ], @@ -364,7 +362,6 @@ "id_exp", "raw_description", "experiment_type", - "description", "metadata", "sequences", "experiment_properties", @@ -496,12 +493,6 @@ ] } }, - "description": { - "type": "string", - "minLength": 10, - "maxLength": 1000, - "description": "Concise human-readable summary of this specific target-probe experiment." 
- }, "metadata": { "type": "object", "additionalProperties": false, From ed87b10f94bdb0813a13c5a7aa02795b99a6a5ac Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Mon, 6 Oct 2025 05:59:32 +0400 Subject: [PATCH 042/102] Fixed single experiment schema started at least --- .../_3_ConstructSingleExperiment/schema.json | 655 +++++++++--------- 1 file changed, 331 insertions(+), 324 deletions(-) diff --git a/extraction/passes/_3_ConstructSingleExperiment/schema.json b/extraction/passes/_3_ConstructSingleExperiment/schema.json index 37961c7..f13ab0b 100644 --- a/extraction/passes/_3_ConstructSingleExperiment/schema.json +++ b/extraction/passes/_3_ConstructSingleExperiment/schema.json @@ -155,7 +155,8 @@ "Pa", "kg/m^3", "s", - "dimensionless" + "dimensionless", + null ], "description": "SI unit after conversion." }, @@ -309,11 +310,17 @@ "description": "The probe's decorated oligo (sequence, labels, direction)." }, "fluorophore": { - "type": "string", + "type": [ + "string", + "null" + ], "description": "Fluorophore name if identifiable; otherwise null." }, "quencher": { - "type": "string", + "type": [ + "string", + "null" + ], "description": "Quencher name if identifiable; otherwise null." }, "sense_antisense": { @@ -339,57 +346,52 @@ } }, "properties": { - "title": "Article with experiments/probes", - "type": "object", - "additionalProperties": false, - "required": [ - "doi", - "experiments", - "extraction_report" - ], - "properties": { - "doi": { - "type": "string", - "minLength": 4, - "maxLength": 100, - "description": "Digital Object Identifier for the article." 
- }, - "experiment": { - "description": "Full description of a single hybridization experiment instance related to this sequence", - "type": "object", - "additionalProperties": false, - "required": [ - "id_exp", - "raw_description", - "experiment_type", - "metadata", - "sequences", - "experiment_properties", - "outcome", - "pairing", - "extraction_report" - ], - "properties": { - "id_exp": { - "type": "string", - "minLength": 1, - "maxLength": 120, - "description": "Unique experiment identifier (derive if needed from amplicon + probe name')." - }, - "raw_description": { - "type": [ - "string", - "null" - ], - "minLength": 1, - "maxLength": 1000, - "description": "Verbatim or lightly tidied description of the experiment from the article." - }, - "experiment_type": { - "type": "object", - "description": "Description of this single hybridization experiment design.", - "additionalProperties": false, - "required": ["probe_type", "chemistry"], + "doi": { + "type": "string", + "minLength": 4, + "maxLength": 100, + "description": "Digital Object Identifier for the article." + }, + "experiment": { + "description": "Full description of a single hybridization experiment instance related to this sequence", + "type": "object", + "additionalProperties": false, + "required": [ + "id_exp", + "raw_description", + "experiment_type", + "metadata", + "sequences", + "experiment_properties", + "outcome", + "pairing", + "extraction_report" + ], + "properties": { + "id_exp": { + "type": "string", + "minLength": 1, + "maxLength": 120, + "description": "Unique experiment identifier (derive if needed from amplicon + probe name')." + }, + "raw_description": { + "type": [ + "string", + "null" + ], + "minLength": 1, + "maxLength": 1000, + "description": "Verbatim or lightly tidied description of the experiment from the article." 
+ }, + "experiment_type": { + "type": "object", + "description": "Description of this single hybridization experiment design.", + "additionalProperties": false, + "required": [ + "probe_type", + "chemistry" + ], + "properties": { "probe_type": { "title": "Probe Type", "description": "Functional design/mechanism class of the probe. Use when the probe's reporting/capture behavior is known.", @@ -492,298 +494,303 @@ } ] } - }, - "metadata": { - "type": "object", - "additionalProperties": false, - "description": "High-level descriptors linked to this experiment.", - "required": [ - "organism", - "technology", - "annealing", - "pH", - "rna_impurities" - ], - "properties": { - "organism": { - "type": [ - "string", - "null" - ], - "minLength": 2, - "maxLength": 120, - "description": "Organism (e.g., 'human')." - }, - "technology": { - "type": [ - "string", - "null" - ], - "minLength": 2, - "maxLength": 120, - "description": "Assay/technology label per article usage (e.g., 'real-time PCR', 'DMA')." - }, - "annealing": { - "type": [ - "object", - "null" - ], - "additionalProperties": false, - "description": "Annealing process details, with optional quantitative and qualitative components.", - "required": [ - "quantitative", - "qualitative" - ], - "properties": { - "quantitative": { - "$ref": "#/$defs/measurement", - "description": "Numeric representation (e.g., time or temperature), kept as raw + SI." - }, - "qualitative": { - "type": [ - "boolean", - "null" - ], - "description": "If the article states a qualitative annealing outcome/criterion." - } - } - }, - "pH": { - "$ref": "#/$defs/measurement", - "description": "pH as raw text with optional parsed numeric; SI stored as dimensionless (same numeric value)." 
- }, - "rna_impurities": { - "type": [ - "object", - "null" - ], - "additionalProperties": false, - "description": "RNA impurity information, if discussed.", - "required": [ - "quantitative", - "qualitative" - ], - "properties": { - "quantitative": { - "$ref": "#/$defs/measurement", - "description": "Quantity/percentage of RNA impurities." - }, - "qualitative": { - "type": [ - "boolean", - "null" - ], - "description": "Presence/absence or a qualitative statement regarding RNA impurities." - } + } + }, + "metadata": { + "type": "object", + "additionalProperties": false, + "description": "High-level descriptors linked to this experiment.", + "required": [ + "organism", + "technology", + "annealing", + "pH", + "rna_impurities" + ], + "properties": { + "organism": { + "type": [ + "string", + "null" + ], + "minLength": 2, + "maxLength": 120, + "description": "Organism (e.g., 'human')." + }, + "technology": { + "type": [ + "string", + "null" + ], + "minLength": 2, + "maxLength": 120, + "description": "Assay/technology label per article usage (e.g., 'real-time PCR', 'DMA')." + }, + "annealing": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "description": "Annealing process details, with optional quantitative and qualitative components.", + "required": [ + "quantitative", + "qualitative" + ], + "properties": { + "quantitative": { + "$ref": "#/$defs/measurement", + "description": "Numeric representation (e.g., time or temperature), kept as raw + SI." + }, + "qualitative": { + "type": [ + "boolean", + "null" + ], + "description": "If the article states a qualitative annealing outcome/criterion." 
} } - } - }, - "sequences": { - "type": "object", - "additionalProperties": false, - "description": "All sequences relevant to this target-probe experiment.", - "required": [ - "target_sequence", - "probe", - "primer_sequences", - "related_sequences" - ], - "properties": { - "target_sequence": { - "oneOf": [ - { - "$ref": "#/$defs/decoratedOligo" - }, - { - "type": "string", - "pattern": "^(Exact target sequence is unknown, here is its description: .*)$", - "minLength": 70, - "maxLength": 200 - } - ], - "description": "Target genomic sequence if explicitly given; store as decorated oligo only if labels are present; otherwise just sequence and length." - }, - "probe": { - "$ref": "#/$defs/probe", - "description": "The hybridization probe for this experiment." - }, - "primer_sequences": { - "oneOf": [ - { - "$ref": "#/$defs/primerPair" - }, - { - "type": "null" - } - ], - "description": "PCR primers associated with this experiment/amplicon if provided." - }, - "related_sequences": { - "type": "array", - "description": "Additional related sequences (controls, references), if any.", - "minItems": 0, - "maxItems": 50, - "items": { - "type": "object", - "additionalProperties": false, - "required": [ - "related_sequence", - "description" + }, + "pH": { + "$ref": "#/$defs/measurement", + "description": "pH as raw text with optional parsed numeric; SI stored as dimensionless (same numeric value)." + }, + "rna_impurities": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "description": "RNA impurity information, if discussed.", + "required": [ + "quantitative", + "qualitative" + ], + "properties": { + "quantitative": { + "$ref": "#/$defs/measurement", + "description": "Quantity/percentage of RNA impurities." + }, + "qualitative": { + "type": [ + "boolean", + "null" ], - "properties": { - "related_sequence": { - "$ref": "#/$defs/decoratedOligo", - "description": "A related sequence (plain or decorated)." 
- }, - "description": { - "type": [ - "string", - "null" - ], - "minLength": 1, - "maxLength": 200, - "description": "Short explanation of the related sequence's role." - } - } + "description": "Presence/absence or a qualitative statement regarding RNA impurities." } } } - }, - "experiment_properties": { - "type": "object", - "additionalProperties": false, - "description": "Quantitative and buffer parameters for this experiment.", - "required": [ - "concentrations", - "parameters_SI" - ], - "properties": { - "concentrations": { - "type": "object", - "additionalProperties": false, - "description": "Concentration-related values.", - "required": [ - "dna_rna_concentration", - "concentration_SI" - ], - "properties": { - "dna_rna_concentration": { - "$ref": "#/$defs/measurement", - "description": "Analyte concentration as reported (raw) plus normalized SI (mol/m^3)." - }, - "concentration_SI": { - "$ref": "#/$defs/measurement", - "description": "Optional redundant SI-only concentration if the article already used SI; keep raw text synchronized." - } + } + }, + "sequences": { + "type": "object", + "additionalProperties": false, + "description": "All sequences relevant to this target-probe experiment.", + "required": [ + "target_sequence", + "probe", + "primer_sequences", + "related_sequences" + ], + "properties": { + "target_sequence": { + "oneOf": [ + { + "$ref": "#/$defs/decoratedOligo" + }, + { + "type": "string", + "pattern": "^(Exact target sequence is unknown, here is its description: .*)$", + "minLength": 70, + "maxLength": 200 + } + ], + "description": "Target genomic sequence if explicitly given; store as decorated oligo only if labels are present; otherwise just sequence and length." + }, + "probe": { + "$ref": "#/$defs/probe", + "description": "The hybridization probe for this experiment." 
+ }, + "primer_sequences": { + "oneOf": [ + { + "$ref": "#/$defs/primerPair" + }, + { + "type": "null" } - }, - "parameters_SI": { + ], + "description": "PCR primers associated with this experiment/amplicon if provided." + }, + "related_sequences": { + "type": "array", + "description": "Additional related sequences (controls, references), if any.", + "minItems": 0, + "maxItems": 50, + "items": { "type": "object", "additionalProperties": false, - "description": "Assay buffer/condition parameters, represented as raw + SI. If any value is not present, fill-in measurements fields as null.", "required": [ - "temperature", - "Tris", - "Na", - "K", - "Mg", - "DMSO" + "related_sequence", + "description" ], "properties": { - "temperature": { - "$ref": "#/$defs/measurement", - "description": "Temperature (e.g., '58 °C'), with SI in Kelvin." - }, - "Tris": { - "$ref": "#/$defs/measurement", - "description": "Tris buffer concentration; SI in mol/m^3 (1 mM = 1 mol/m^3)." - }, - "Na": { - "$ref": "#/$defs/measurement", - "description": "Sodium ion concentration; SI in mol/m^3." - }, - "K": { - "$ref": "#/$defs/measurement", - "description": "Potassium ion concentration; SI in mol/m^3." + "related_sequence": { + "$ref": "#/$defs/decoratedOligo", + "description": "A related sequence (plain or decorated)." }, - "Mg": { - "$ref": "#/$defs/measurement", - "description": "Magnesium ion concentration; SI in mol/m^3." - }, - "DMSO": { - "$ref": "#/$defs/measurement", - "description": "DMSO amount (often % v/v); SI as dimensionless fraction (percent/100)." + "description": { + "type": [ + "string", + "null" + ], + "minLength": 1, + "maxLength": 200, + "description": "Short explanation of the related sequence's role." 
} } } } - }, - "outcome": { - "type": "object", - "additionalProperties": false, - "description": "Results for this target-probe pairing.", - "required": [ - "outcome", - "fluorescence", - "comparative_notes" - ], - "properties": { - "outcome": { - "type": [ - "boolean", - "null" - ], - "description": "Boolean result if explicitly stated (e.g., success/failure). If not explicit, leave null." - }, - "fluorescence": { - "$ref": "#/$defs/measurement", - "description": "Fluorescence or signal measurement (raw text + normalized form if numeric). If comparative only, keep statement in 'raw' and numeric fields null." - }, - "comparative_notes": { - "type": [ - "string", - "null" - ], - "minLength": 0, - "maxLength": 500, - "description": "Comparative statements (e.g., 'N3-FAM stronger in real-time PCR; N3-Cy5 stronger in DMA')." + } + }, + "experiment_properties": { + "type": "object", + "additionalProperties": false, + "description": "Quantitative and buffer parameters for this experiment.", + "required": [ + "concentrations", + "parameters_SI" + ], + "properties": { + "concentrations": { + "type": "object", + "additionalProperties": false, + "description": "Concentration-related values.", + "required": [ + "dna_rna_concentration", + "concentration_SI" + ], + "properties": { + "dna_rna_concentration": { + "$ref": "#/$defs/measurement", + "description": "Analyte concentration as reported (raw) plus normalized SI (mol/m^3)." + }, + "concentration_SI": { + "$ref": "#/$defs/measurement", + "description": "Optional redundant SI-only concentration if the article already used SI; keep raw text synchronized." 
+ } } - } - }, - "pairing": { - "type": "object", - "additionalProperties": false, - "description": "Optional cross-references to paired/reciprocal probes within the same article.", - "required": [ - "paired_with_probe_name", - "relationship" - ], - "properties": { - "paired_with_probe_name": { - "type": [ - "string", - "null" - ], - "description": "Name of the other probe in a reciprocal comparison (e.g., 'N3-Cy5(27)s')." - }, - "relationship": { - "type": [ - "string", - "null" - ], - "description": "Short label describing the relation (e.g., 'reciprocal comparison', 'same sequence different labels')." + }, + "parameters_SI": { + "type": "object", + "additionalProperties": false, + "description": "Assay buffer/condition parameters, represented as raw + SI. If any value is not present, fill-in measurements fields as null.", + "required": [ + "temperature", + "Tris", + "Na", + "K", + "Mg", + "DMSO" + ], + "properties": { + "temperature": { + "$ref": "#/$defs/measurement", + "description": "Temperature (e.g., '58 °C'), with SI in Kelvin." + }, + "Tris": { + "$ref": "#/$defs/measurement", + "description": "Tris buffer concentration; SI in mol/m^3 (1 mM = 1 mol/m^3)." + }, + "Na": { + "$ref": "#/$defs/measurement", + "description": "Sodium ion concentration; SI in mol/m^3." + }, + "K": { + "$ref": "#/$defs/measurement", + "description": "Potassium ion concentration; SI in mol/m^3." + }, + "Mg": { + "$ref": "#/$defs/measurement", + "description": "Magnesium ion concentration; SI in mol/m^3." + }, + "DMSO": { + "$ref": "#/$defs/measurement", + "description": "DMSO amount (often % v/v); SI as dimensionless fraction (percent/100)." 
+ } } } - }, - "extraction_report": { - "$ref": "#/$defs/extractionReport" } + }, + "outcome": { + "type": "object", + "additionalProperties": false, + "description": "Results for this target-probe pairing.", + "required": [ + "outcome", + "fluorescence", + "comparative_notes" + ], + "properties": { + "outcome": { + "type": [ + "boolean", + "null" + ], + "description": "Boolean result if explicitly stated (e.g., success/failure). If not explicit, leave null." + }, + "fluorescence": { + "$ref": "#/$defs/measurement", + "description": "Fluorescence or signal measurement (raw text + normalized form if numeric). If comparative only, keep statement in 'raw' and numeric fields null." + }, + "comparative_notes": { + "type": [ + "string", + "null" + ], + "minLength": 0, + "maxLength": 500, + "description": "Comparative statements (e.g., 'N3-FAM stronger in real-time PCR; N3-Cy5 stronger in DMA')." + } + } + }, + "pairing": { + "type": "object", + "additionalProperties": false, + "description": "Optional cross-references to paired/reciprocal probes within the same article.", + "required": [ + "paired_with_probe_name", + "relationship" + ], + "properties": { + "paired_with_probe_name": { + "type": [ + "string", + "null" + ], + "description": "Name of the other probe in a reciprocal comparison (e.g., 'N3-Cy5(27)s')." + }, + "relationship": { + "type": [ + "string", + "null" + ], + "description": "Short label describing the relation (e.g., 'reciprocal comparison', 'same sequence different labels')." 
+ } + } + }, + "extraction_report": { + "$ref": "#/$defs/extractionReport" } - }, - "extraction_report": { - "$ref": "#/$defs/extractionReport" } + }, + "extraction_report": { + "$ref": "#/$defs/extractionReport" } - } + }, + "required": [ + "doi", + "experiment", + "extraction_report" + ] } \ No newline at end of file From ed7fcc93a6de3b51c4c0e60233b9196608015ec5 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Mon, 6 Oct 2025 06:03:31 +0400 Subject: [PATCH 043/102] Set global timeout for 10 minutes --- extraction/config/pipeline.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 12010d2..206a1f7 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -15,7 +15,7 @@ "seed": 52 }, "ollama_base_url": "http://127.0.0.1:11434", - "timeout_s": 120, + "timeout_s": 600, "input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", "out_dir": "outlines_output_pre", "full_schema_path": "schema/json/article.json", From e344736e55d3a14002f0f2cad05ec715566310a8 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Mon, 6 Oct 2025 16:09:51 +0400 Subject: [PATCH 044/102] Added new construction schemas and logic, lower timeout for debugging --- extraction/config/pipeline.json | 24 +- extraction/passes/D_parameters/schema.json | 6 +- extraction/passes/_1_SeqPrompt/prompt.txt | 2 + .../passes/_1_SeqPrompt/prompt_strict.txt | 2 + .../prompt.txt | 18 + .../schema.json | 682 ++++++++++++++ .../prompt.txt | 18 + .../schema.json | 678 ++++++++++++++ .../prompt.txt | 18 + .../schema.json | 840 ++++++++++++++++++ extraction/pipeline_pre_quest.py | 53 +- 11 files changed, 2314 insertions(+), 27 deletions(-) create mode 100644 extraction/passes/_4_ConstructSingleSmallExperiment/prompt.txt create mode 100644 extraction/passes/_4_ConstructSingleSmallExperiment/schema.json create mode 100644 extraction/passes/_5_ConstructSingleSequenceExperiment/prompt.txt 
create mode 100644 extraction/passes/_5_ConstructSingleSequenceExperiment/schema.json create mode 100644 extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/prompt.txt create mode 100644 extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/schema.json diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 206a1f7..fa7e69a 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -15,7 +15,7 @@ "seed": 52 }, "ollama_base_url": "http://127.0.0.1:11434", - "timeout_s": 600, + "timeout_s": 20, "input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", "out_dir": "outlines_output_pre", "full_schema_path": "schema/json/article.json", @@ -35,12 +35,32 @@ "timeout": 60 } ], - "construct_single_experiment_pass": { + "construct_single_experiment_passes": [ + { "name": "ConstructSingleExperiment", "schema": "passes/_3_ConstructSingleExperiment/schema.json", "prompt": "passes/_3_ConstructSingleExperiment/prompt.txt", "timeout": 600 }, + { + "name": "ConstructSingleSmallExperiment", + "schema": "passes/_4_ConstructSingleSmallExperiment/schema.json", + "prompt": "passes/_4_ConstructSingleSmallExperiment/prompt.txt", + "timeout": 600 + }, + { + "name": "ConstructSingleSequenceExperiment", + "schema": "passes/_5_ConstructSingleSequenceExperiment/schema.json", + "prompt": "passes/_5_ConstructSingleSequenceExperiment/prompt.txt", + "timeout": 600 + }, + { + "name": "ConstructSingleSequenceExperimentAndOutcome", + "schema": "passes/_6_ConstructSingleSequenceExperimentAndOutcome/schema.json", + "prompt": "passes/_6_ConstructSingleSequenceExperimentAndOutcome/prompt.txt", + "timeout": 600 + } + ], "passes": [ { "name": "A_core", diff --git a/extraction/passes/D_parameters/schema.json b/extraction/passes/D_parameters/schema.json index 2856c9e..08511ad 100644 --- a/extraction/passes/D_parameters/schema.json +++ b/extraction/passes/D_parameters/schema.json @@ -100,9 +100,9 @@ "additionalProperties": 
false, "required": ["raw", "value", "unit"], "properties": { - "raw": { "type": "string", "minLength": 1, "maxLength": 200 }, - "value": { "type": ["number", "null"] }, - "unit": { "type": ["string", "null"], "maxLength": 50 } + "raw": { "type": "string", "minLength": 1, "maxLength": 200, "description": "Textual value representation." }, + "value": { "type": ["number", "null"], "description": "Numeric value representation." }, + "unit": { "type": ["string", "null"], "maxLength": 50, "description": "Measurement unit for the numeric value representation." } } } } diff --git a/extraction/passes/_1_SeqPrompt/prompt.txt b/extraction/passes/_1_SeqPrompt/prompt.txt index eb602b3..b17ac4d 100644 --- a/extraction/passes/_1_SeqPrompt/prompt.txt +++ b/extraction/passes/_1_SeqPrompt/prompt.txt @@ -16,6 +16,8 @@ STRICT RULES of how do you work and response: Perform the following task: * Extract all the DNA or RNA sequences provided in this article and provide them in a JSON format. +* Be sure to extract real nucleotidic sequences in IUPAC form (with or without modification). +* But be careful and cautions and remove letter sequences unrelated to the nucleotide sequences. Articles may have other letter sequences such as an abbreviations or so. You are inly interested in nucleotidic sequences, such as the probe sequences, primers, target sequences etc. Here is the JSON schema you have to follow: ```json diff --git a/extraction/passes/_1_SeqPrompt/prompt_strict.txt b/extraction/passes/_1_SeqPrompt/prompt_strict.txt index 96a4d96..5c0694f 100644 --- a/extraction/passes/_1_SeqPrompt/prompt_strict.txt +++ b/extraction/passes/_1_SeqPrompt/prompt_strict.txt @@ -16,6 +16,8 @@ STRICT RULES of how do you work and response: Perform the following task: * Extract all the DNA or RNA sequences provided in this article and provide them in a JSON format. +* Be sure to extract real nucleotidic sequences in IUPAC form (only the part without modification, as format does not allow dashes). 
+* But be careful and cautions and remove letter sequences unrelated to the nucleotide sequences. Articles may have other letter sequences such as an abbreviations or so. You are inly interested in nucleotidic sequences, such as the probe sequences, primers, target sequences etc. Here is the JSON schema you have to follow: ```json diff --git a/extraction/passes/_4_ConstructSingleSmallExperiment/prompt.txt b/extraction/passes/_4_ConstructSingleSmallExperiment/prompt.txt new file mode 100644 index 0000000..f4b0e52 --- /dev/null +++ b/extraction/passes/_4_ConstructSingleSmallExperiment/prompt.txt @@ -0,0 +1,18 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following tasks for JSON extraction: +* Describe the hybridization experiment in which the given nucleotide sequence is present and provide your answer in a JSON format following the schema. diff --git a/extraction/passes/_4_ConstructSingleSmallExperiment/schema.json b/extraction/passes/_4_ConstructSingleSmallExperiment/schema.json new file mode 100644 index 0000000..6f41b81 --- /dev/null +++ b/extraction/passes/_4_ConstructSingleSmallExperiment/schema.json @@ -0,0 +1,682 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://example.org/schemas/hybridization-article.schema.json", + "title": "Hybridization Article", + "description": "Per-article extraction of hybridization experiments as target-probe pairs (plus primers/related sequences). 
Includes decorated oligos (fluorophores/quenchers, 5'/3' marks, sense/antisense), and parameters stored as raw text and normalized SI.", + "type": "object", + "unevaluatedProperties": false, + "additionalProperties": false, + "required": [ + "experiment", + "extraction_report" + ], + "$defs": { + "extractionReport": { + "type": "object", + "description": "Structured way to declare missing/uncertain items to avoid hallucination. Use JSON Pointers for field locations.", + "additionalProperties": false, + "required": [ + "missing", + "uncertain", + "notes" + ], + "properties": { + "missing": { + "type": "array", + "description": "JSON Pointers to fields that are truly unavailable in the article.", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 0 + }, + "uncertain": { + "type": "array", + "description": "JSON Pointers to fields that are ambiguous or weakly supported.", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 0 + }, + "notes": { + "type": [ + "string", + "null" + ], + "description": "Free-text clarifications, e.g., OCR issues, mapping choices." + } + } + }, + "iupacBases": { + "type": "string", + "description": "DNA/RNA bases in uppercase IUPAC alphabet: A C G U/T R Y S W K M B D H V N. No separators and no ellipsis inside the sequence.", + "pattern": "^([ACGUTRYSWKMBDHVN]+)$|^(Exact sequence unknown, here is its description: (.*))$", + "minLength": 5, + "maxLength": 5000 + }, + "provenance": { + "type": "object", + "description": "Where a value was obtained in the source document.", + "additionalProperties": false, + "properties": { + "source_type": { + "type": "string", + "enum": [ + "pdf", + "html", + "other", + "unknown" + ], + "description": "Type of source the extractor processed." + }, + "page": { + "type": [ + "integer", + "null" + ], + "minimum": 1, + "description": "Page number in the source (1-based), if applicable." 
+ }, + "section": { + "type": [ + "string", + "null" + ], + "description": "Section header or caption in which the value appears." + }, + "quote": { + "type": [ + "string", + "null" + ], + "description": "Short verbatim snippet from the article that directly supports the value." + }, + "notes": { + "type": [ + "string", + "null" + ], + "description": "Extractor notes (e.g., OCR artifact, inferred mapping)." + } + } + }, + "measurement": { + "type": "object", + "description": "Numeric (or quasi-numeric) item holding raw text, optional parsed value/unit, and normalized SI value/unit.", + "additionalProperties": false, + "required": [ + "raw", + "value", + "unit", + "si_value", + "si_unit", + "assumptions" + ], + "properties": { + "raw": { + "type": "string", + "minLength": 1, + "maxLength": 200, + "description": "Exact text as written in the article (e.g., '58 °C', '2 mM', '10%')." + }, + "value": { + "type": [ + "number", + "null" + ], + "description": "Parsed numeric value if present in raw." + }, + "unit": { + "type": [ + "string", + "null" + ], + "description": "Unit as written in the article (e.g., '°C', 'mM', '%')." + }, + "si_value": { + "type": [ + "number", + "null" + ], + "description": "Value converted to SI. Examples: temperature in K; concentrations in mol/m^3; fractions 0-1 for percent." + }, + "si_unit": { + "type": [ + "string", + "null" + ], + "enum": [ + "K", + "mol/m^3", + "Pa", + "kg/m^3", + "s", + "dimensionless", + "%", + "kg", + "mol", + "m", + null + ], + "description": "SI unit after conversion." + }, + "assumptions": { + "type": [ + "string", + "null" + ], + "description": "Conversion assumptions (e.g., density used, ionic strength conventions)." + }, + "provenance": { + "$ref": "#/$defs/provenance" + } + } + }, + "decoratedOligo": { + "type": "object", + "description": "An oligonucleotide possibly decorated at 5'/3' with labels (fluorophores/quenchers). 
Keeps raw string and parsed parts.", + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "labels", + "sense_antisense" + ], + "properties": { + "raw": { + "type": "string", + "minLength": 5, + "maxLength": 200, + "description": "Exact oligo string as seen. MUST CONTAIN NUCLEOTIDES, NOT ONLY NAMES. DO NOT COPY THIS SEQUENCE FROM THE EXAMPLE! NEVER USE ELLIPSIS OR SKIP ANY DATA IN YOUR RESPONSE!!!" + }, + "sequence": { + "$ref": "#/$defs/iupacBases", + "description": "Bare base sequence with IUPAC letters only (no labels/hyphens)." + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1, + "description": "Base length if given or derivable (e.g., '(27 b)')." + }, + "prime_prefix": { + "type": [ + "integer", + "null" + ], + "enum": [ + 3, + 5, + null + ], + "description": "Leading prime marker if present (3 or 5). Accepts OCR artifacts like 50/5O/5' during parsing." + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "description": "Label at the 5' end if indicated (e.g., FAM, ROX)." + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "description": "Label at the 3' end if indicated (e.g., BHQ1, BHQ2, RTQ1)." + }, + "labels": { + "type": "array", + "description": "All labels found in textual order, including 5' and 3' labels.", + "minItems": 0, + "maxItems": 10, + "items": { + "type": "string" + } + }, + "sense_antisense": { + "type": [ + "string", + "null" + ], + "enum": [ + "sense", + "antisense", + null + ], + "description": "If the oligo is explicitly designated as sense (s) or antisense (as) in the article." 
+ }, + "provenance": { + "$ref": "#/$defs/provenance" + } + } + }, + "primerPair": { + "type": "object", + "description": "PCR primer pair associated with an amplicon/experiment.", + "additionalProperties": false, + "required": [ + "forward", + "reverse" + ], + "properties": { + "forward": { + "$ref": "#/$defs/decoratedOligo", + "description": "Forward primer as decorated oligo." + }, + "reverse": { + "$ref": "#/$defs/decoratedOligo", + "description": "Reverse primer as decorated oligo." + } + } + }, + "probe": { + "type": "object", + "description": "A hybridization probe with name, optional amplicon ID, and decorated oligo details.", + "additionalProperties": false, + "required": [ + "name", + "oligo", + "amplicon_id", + "fluorophore", + "quencher", + "sense_antisense", + "notes" + ], + "properties": { + "name": { + "type": "string", + "minLength": 2, + "maxLength": 60, + "description": "Probe name exactly as used (e.g., 'N3-FAM(27)s')." + }, + "amplicon_id": { + "type": [ + "string", + "null" + ], + "description": "Amplicon tag associated with the probe (e.g., 'K2', 'K3', 'N2', 'N3', 'B15')." + }, + "oligo": { + "$ref": "#/$defs/decoratedOligo", + "description": "The probe's decorated oligo (sequence, labels, direction)." + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "description": "Fluorophore name if identifiable; otherwise null." + }, + "quencher": { + "type": [ + "string", + "null" + ], + "description": "Quencher name if identifiable; otherwise null." + }, + "sense_antisense": { + "type": [ + "string", + "null" + ], + "enum": [ + "sense", + "antisense", + null + ], + "description": "Sense/antisense designation inferred from probe name suffix (e.g., 's' or 'as')." + }, + "notes": { + "type": [ + "string", + "null" + ], + "description": "Free-text notes about the probe (ambiguities, special chemistry)." 
+ } + } + } + }, + "properties": { + "experiment": { + "description": "Full description of a single hybridization experiment instance related to this sequence", + "type": "object", + "additionalProperties": false, + "required": [ + "experiment_raw_description", + "sequences", + "experiment_type", + "outcome", + "experiment_properties" + ], + "properties": { + "experiment_raw_description": { + "type": [ + "string", + "null" + ], + "minLength": 1, + "maxLength": 1000, + "description": "Verbatim or lightly tidied description of the experiment from the article." + }, + "sequences": { + "type": "object", + "additionalProperties": false, + "description": "All sequences relevant to this target-probe experiment.", + "required": [ + "target_sequence", + "probe", + "primer_sequences", + "related_sequences" + ], + "properties": { + "target_sequence": { + "oneOf": [ + { + "$ref": "#/$defs/decoratedOligo" + }, + { + "type": "string", + "pattern": "^(Exact target sequence is unknown, here is its description: .*)$", + "minLength": 70, + "maxLength": 200 + } + ], + "description": "Target genomic sequence if explicitly given; store as decorated oligo only if labels are present; otherwise just sequence and length." + }, + "probe": { + "$ref": "#/$defs/probe", + "description": "The hybridization probe for this experiment." + }, + "primer_sequences": { + "oneOf": [ + { + "$ref": "#/$defs/primerPair" + }, + { + "type": "null" + } + ], + "description": "Primers associated with this experiment/amplicon if provided." + }, + "related_sequences": { + "type": "array", + "description": "Additional related sequences (controls, references), if any.", + "minItems": 0, + "maxItems": 50, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "related_sequence", + "description" + ], + "properties": { + "related_sequence": { + "$ref": "#/$defs/decoratedOligo", + "description": "A related sequence (plain or decorated)." 
+ }, + "description": { + "type": [ + "string", + "null" + ], + "minLength": 1, + "maxLength": 200, + "description": "Short explanation of the related sequence's role." + } + } + } + } + } + }, + "experiment_type": { + "type": "object", + "description": "Description of this single hybridization experiment design.", + "additionalProperties": false, + "required": [ + "probe_type", + "chemistry" + ], + "properties": { + "probe_type": { + "title": "Probe Type", + "description": "Functional design/mechanism class of the probe. Use when the probe's reporting/capture behavior is known.", + "oneOf": [ + { + "const": "linear", + "title": "Linear", + "description": "Simple oligo that hybridizes without structural activation; often end-labeled." + }, + { + "const": "molecular_beacon", + "title": "Molecular beacon", + "description": "Stem-loop with fluor/quencher; fluorescence turns on when the stem opens on target." + }, + { + "const": "hydrolysis_taqman", + "title": "Hydrolysis (TaqMan)", + "description": "Dual-labeled probe cleaved by 5'->3' exonuclease during PCR; signal increases on cleavage." + }, + { + "const": "fret_dual_hybridization", + "title": "FRET dual-hybridization", + "description": "Two adjacent probes with donor/acceptor dyes; FRET only when both bind nearby." + }, + { + "const": "scorpion", + "title": "Scorpion", + "description": "Primer-linked hairpin probe enabling fast intra-molecular reporting in qPCR." + }, + { + "const": "hcr", + "title": "Hybridization Chain Reaction (HCR)", + "description": "Initiator + hairpins polymerize upon target binding; enzyme-free signal amplification." + }, + { + "const": "branched_dna", + "title": "Branched DNA (bDNA)", + "description": "Signal amplification via multibranch DNA scaffolds without target amplification." + }, + { + "const": "padlock", + "title": "Padlock", + "description": "Linear probe circularizes on perfect match; detected via rolling-circle amplification." 
+ }, + { + "const": "capture", + "title": "Capture", + "description": "Affinity-tagged probe used to enrich/pull down targets (e.g., biotinylated capture baits)." + }, + { + "const": "tiling_set", + "title": "Tiling set", + "description": "Multiple overlapping probes across a region/gene for robust detection." + }, + { + "const": "antisense", + "title": "Antisense", + "description": "Probe designed to bind and block or track transcripts (e.g., ISH tracking)." + } + ] + }, + "chemistry": { + "title": "Chemistry Backbone", + "description": "Backbone and chemical modifications (sequence-level chemistry rather than labels/reporters). Primary nucleic-acid scaffold used by the probe.", + "oneOf": [ + { + "const": "dna", + "title": "DNA", + "description": "Unmodified DNA backbone." + }, + { + "const": "rna", + "title": "RNA", + "description": "Unmodified RNA backbone." + }, + { + "const": "cdna", + "title": "cDNA", + "description": "Complementary DNA derived from RNA." + }, + { + "const": "pna", + "title": "PNA", + "description": "Peptide nucleic acid backbone." + }, + { + "const": "morpholino", + "title": "Morpholino", + "description": "Morpholine-ring phosphorodiamidate backbone." + }, + { + "const": "lna_modified", + "title": "LNA-modified", + "description": "DNA/RNA with Locked Nucleic Acid residues incorporated." + }, + { + "const": "two_ome_rna", + "title": "2'-O-Me RNA", + "description": "2'-O-methyl RNA backbone." 
+ } + ] + } + } + }, + + + "experiment_properties": { + "type": "object", + "additionalProperties": false, + "description": "Quantitative and buffer parameters for this experiment.", + "required": [ + "concentrations", + "parameters_SI" + ], + "properties": { + "concentrations": { + "type": "object", + "additionalProperties": false, + "description": "Concentration-related values.", + "required": [ + "dna_rna_concentration", + "concentration_SI" + ], + "properties": { + "dna_rna_concentration": { + "$ref": "#/$defs/measurement", + "description": "Analyte concentration as reported (raw) plus normalized SI (mol/m^3)." + }, + "concentration_SI": { + "$ref": "#/$defs/measurement", + "description": "Optional redundant SI-only concentration if the article already used SI; keep raw text synchronized." + } + } + }, + "parameters_SI": { + "type": "object", + "additionalProperties": false, + "description": "Assay buffer/condition parameters, represented as raw + SI. If any value is not present, fill-in measurements fields as null.", + "required": [ + "temperature", + "Tris", + "Na", + "K", + "Mg", + "DMSO" + ], + "properties": { + "temperature": { + "$ref": "#/$defs/measurement", + "description": "Temperature (e.g., '58 °C'), with SI in Kelvin." + }, + "Tris": { + "$ref": "#/$defs/measurement", + "description": "Tris buffer concentration; SI in mol/m^3 (1 mM = 1 mol/m^3)." + }, + "Na": { + "$ref": "#/$defs/measurement", + "description": "Sodium ion concentration; SI in mol/m^3." + }, + "K": { + "$ref": "#/$defs/measurement", + "description": "Potassium ion concentration; SI in mol/m^3." + }, + "Mg": { + "$ref": "#/$defs/measurement", + "description": "Magnesium ion concentration; SI in mol/m^3." + }, + "DMSO": { + "$ref": "#/$defs/measurement", + "description": "DMSO amount (often % v/v); SI as dimensionless fraction (percent/100)." 
+ } + } + } + } + }, + "outcome": { + "type": "object", + "additionalProperties": false, + "description": "Results for this target-probe pairing.", + "required": [ + "outcome", + "hybridization_probability", + "specificity", + "fluorescence", + "comparative_notes" + ], + "properties": { + "outcome": { + "type": [ + "boolean", + "null" + ], + "description": "Boolean result if explicitly stated (e.g., success=true/failure=false). If not explicit, leave null." + }, + "hybridization_probability":{ + "type": "object", + "additionalProperties": false, + "required": ["numeric", "textual"], + "properties":{ + "numeric": { "type": ["number", "null"], "description": "Probability of the probe to hybridize to the target in this hybridization experiment, if stated in the article text."}, + "textual": { "type": ["string", "null"], "maxLength": 200, "description": "Explain the notes from the article regarding the probability of the probe to hybridize to the target in this hybridization experiment, even if the numeric value is not present."} + } + }, + "specificity":{ + "type": "object", + "additionalProperties": false, + "required": ["numeric", "textual"], + "properties":{ + "numeric": { "type": ["number", "null"], "description": "Target specificity of the probe in this hybridization experiment, if stated in the article text."}, + "textual": { "type": ["string", "null"], "maxLength": 200, "description": "Explain the notes from the article regarding the target specificity of the probe in this hybridization experiment, even if the numeric value is not present."} + } + }, + "fluorescence": { + "$ref": "#/$defs/measurement", + "description": "Fluorescence or signal measurement (raw text + normalized form if numeric). If comparative only, keep statement in 'raw' and numeric fields null." 
+ }, + "comparative_notes": { + "type": [ + "string", + "null" + ], + "minLength": 0, + "maxLength": 500, + "description": "Comparative statements (e.g., 'N3-FAM stronger in real-time PCR; N3-Cy5 stronger in DMA')." + } + } + } + } + }, + "extraction_report": { + "$ref": "#/$defs/extractionReport" + } + } +} \ No newline at end of file diff --git a/extraction/passes/_5_ConstructSingleSequenceExperiment/prompt.txt b/extraction/passes/_5_ConstructSingleSequenceExperiment/prompt.txt new file mode 100644 index 0000000..f4b0e52 --- /dev/null +++ b/extraction/passes/_5_ConstructSingleSequenceExperiment/prompt.txt @@ -0,0 +1,18 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions that you have to answer based only on the provided article text. + +STRICT RULES of how you work and respond: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully; never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY the English language and Latin script, ASCII only. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For a perfect result compliant with all constraints and limitations, I will tip $2000! + +Perform the following tasks for JSON extraction: +* Describe the hybridization experiment in which the given nucleotide sequence is present and provide your answer in a JSON format following the schema. 
diff --git a/extraction/passes/_5_ConstructSingleSequenceExperiment/schema.json b/extraction/passes/_5_ConstructSingleSequenceExperiment/schema.json new file mode 100644 index 0000000..cc75bad --- /dev/null +++ b/extraction/passes/_5_ConstructSingleSequenceExperiment/schema.json @@ -0,0 +1,678 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "SequencesPerExperiment (generation schema)", + "type": "object", + "additionalProperties": false, + "unevaluatedProperties": false, + "required": [ + "id_exp", + "probe", + "target_sequence", + "primer_sequences", + "related_sequences" + ], + "properties": { + "id_exp": { + "type": "string" + }, + "probe": { + "type": "object", + "additionalProperties": false, + "required": [ + "name", + "amplicon_id", + "fluorophore", + "quencher", + "sense_antisense", + "notes", + "oligo" + ], + "properties": { + "name": { + "type": "string", + "maxLength": 500 + }, + "amplicon_id": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "notes": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "oligo": { + "type": "object", + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": "string", + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + 
"type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + } + } + } + } + }, + "target_sequence": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications", + "fluorophore", + "quencher" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": [ + "string", + "null" + ], + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$", + "description": "Provide IUPAC sequence for the target of this probe, if it's present in article. Otherwise put null here and just put name and description into the raw field." 
+ }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + }, + "primer_sequences": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "required": [ + "forward", + "reverse" + ], + "properties": { + "forward": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications", + "fluorophore", + "quencher" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": [ + "string", + "null" + ], + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + 
"maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + }, + "reverse": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications", + "fluorophore", + "quencher" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": [ + "string", + "null" + ], + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + 
"string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + } + }, + "related_sequences": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "related_sequence", + "description" + ], + "properties": { + "related_sequence": { + "type": "object", + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications", + "fluorophore", + "quencher" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": [ + "string", + "null" + ], + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + 
"string", + "null" + ], + "maxLength": 100 + } + } + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 200 + } + } + } + } + } +} \ No newline at end of file diff --git a/extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/prompt.txt b/extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/prompt.txt new file mode 100644 index 0000000..f4b0e52 --- /dev/null +++ b/extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/prompt.txt @@ -0,0 +1,18 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions that you have to answer based only on the provided article text. + +STRICT RULES of how you work and respond: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully; never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY the English language and Latin script, ASCII only. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For a perfect result compliant with all constraints and limitations, I will tip $2000! + +Perform the following tasks for JSON extraction: +* Describe the hybridization experiment in which the given nucleotide sequence is present and provide your answer in a JSON format following the schema. 
diff --git a/extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/schema.json b/extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/schema.json new file mode 100644 index 0000000..20e8457 --- /dev/null +++ b/extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/schema.json @@ -0,0 +1,840 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "SequencesPerExperiment (generation schema)", + "type": "object", + "additionalProperties": false, + "unevaluatedProperties": false, + "required": [ + "id_exp", + "probe", + "target_sequence", + "primer_sequences", + "related_sequences", + "hybridization_experiment_parameters", + "hybridization_experiment_outcome" + ], + "$defs": { + "measurement_lite": { + "type": "object", + "additionalProperties": false, + "required": [ + "raw", + "value", + "unit" + ], + "properties": { + "raw": { + "type": "string", + "minLength": 1, + "maxLength": 200, + "description": "Textual value representation." + }, + "value": { + "type": [ + "number", + "null" + ], + "description": "Numeric value representation." + }, + "unit": { + "type": [ + "string", + "null" + ], + "maxLength": 50, + "description": "Measurement unit for the numeric value representation." 
+ } + } + } + }, + "properties": { + "id_exp": { + "type": "string" + }, + "probe": { + "type": "object", + "additionalProperties": false, + "required": [ + "name", + "amplicon_id", + "fluorophore", + "quencher", + "sense_antisense", + "notes", + "oligo" + ], + "properties": { + "name": { + "type": "string", + "maxLength": 500 + }, + "amplicon_id": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "notes": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "oligo": { + "type": "object", + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": "string", + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + 
"maxLength": 100 + } + } + } + } + } + } + } + }, + "target_sequence": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications", + "fluorophore", + "quencher" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": [ + "string", + "null" + ], + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$", + "description": "Provide IUPAC sequence for the target of this probe, if it's present in article. Otherwise put null here and just put name and description into the raw field." + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + }, + "primer_sequences": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "required": [ + "forward", + "reverse" + ], + "properties": { + "forward": { + "type": [ + "object", + "null" + ], + 
"additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications", + "fluorophore", + "quencher" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": [ + "string", + "null" + ], + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + }, + "reverse": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications", + "fluorophore", + "quencher" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": [ + "string", + "null" + ], + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "length_bases": { + 
"type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + } + }, + "related_sequences": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "related_sequence", + "description" + ], + "properties": { + "related_sequence": { + "type": "object", + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications", + "fluorophore", + "quencher" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": [ + "string", + "null" + ], + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" 
+ ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 200 + } + } + } + }, + "hybridization_experiment_parameters": { + "type": "object", + "additionalProperties": false, + "required": [ + "organism", + "technology", + "annealing", + "pH", + "rna_impurities", + "temperature", + "Tris", + "Na", + "K", + "Mg", + "DMSO" + ], + "properties": { + "organism": { + "type": [ + "string", + "null" + ], + "maxLength": 200 + }, + "technology": { + "type": [ + "string", + "null" + ], + "maxLength": 200 + }, + "temperature": { + "$ref": "#/$defs/measurement_lite" + }, + "Tris": { + "$ref": "#/$defs/measurement_lite" + }, + "Na": { + "$ref": "#/$defs/measurement_lite" + }, + "K": { + "$ref": "#/$defs/measurement_lite" + }, + "Mg": { + "$ref": "#/$defs/measurement_lite" + }, + "DMSO": { + "$ref": "#/$defs/measurement_lite" + }, + "annealing": { + "oneOf": [ + { + "type": "object", + "additionalProperties": false, + "required": [ + "quantitative", + "qualitative" + ], + "properties": { + "quantitative": { + "$ref": "#/$defs/measurement_lite" + }, + "qualitative": { + "type": [ + "boolean", + "null" + ] + } + } + }, + { + "type": "null" + } + ] + }, + "pH": { + "$ref": "#/$defs/measurement_lite" + }, + 
"rna_impurities": { + "oneOf": [ + { + "type": "object", + "additionalProperties": false, + "required": [ + "quantitative", + "qualitative" + ], + "properties": { + "quantitative": { + "$ref": "#/$defs/measurement_lite" + }, + "qualitative": { + "type": [ + "boolean", + "null" + ] + } + } + }, + { + "type": "null" + } + ] + } + } + }, + "hybridization_experiment_outcome": { + "outcome": { + "type": [ + "boolean", + "null" + ], + "description": "Put true in case of successful hybridization, false if unsuccessful, null if could not be onferred from the article text." + }, + "fluorescence": { + "$ref": "#/$defs/measurement_lite", + "description": "Amount of fluorescence in this hybridization experiment." + }, + "comparative_notes": { + "type": [ + "string", + "null" + ], + "maxLength": 500 + } + } + } +} \ No newline at end of file diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 538ec05..ed9cbd3 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -67,7 +67,7 @@ class PipelineConfig: input_dir: Path out_dir: Path full_schema_path: Optional[Path] - construct_single_experiment_pass: Optional[PassConfig] + construct_single_experiment_passes: List[PassConfig] db_path: Optional[Path] article_glob: str pre_passes: List[PassConfig] @@ -119,13 +119,16 @@ def _opt_path(p) -> Optional[Path]: ) ) - p = data["construct_single_experiment_pass"] - construct_single_experiment_pass = PassConfig( - name=p["name"], - schema_path=project_dir / p["schema"], - prompt_path=project_dir / p["prompt"], - timeout=p.get("timeout", None), - ) + construct_single_experiment_passes = [] + for p in data["construct_single_experiment_passes"]: + construct_single_experiment_passes.append( + PassConfig( + name=p["name"], + schema_path=project_dir / p["schema"], + prompt_path=project_dir / p["prompt"], + timeout=p.get("timeout", None), + ) + ) passes: List[PassConfig] = [] for p in data["passes"]: @@ -146,7 +149,7 @@ def 
_opt_path(p) -> Optional[Path]: input_dir=project_dir / data.get("input_dir", "inputs"), out_dir=project_dir / data.get("out_dir", "out"), full_schema_path=_opt_path(data.get("full_schema_path")), - construct_single_experiment_pass=construct_single_experiment_pass, + construct_single_experiment_passes=construct_single_experiment_passes, db_path=_opt_path(data.get("db_path")), article_glob=data.get("article_glob", "*.txt"), pre_passes=pre_passes, @@ -1028,19 +1031,25 @@ def run_project(project_dir: str | Path) -> None: leave=False, ) ): - run_construct_single_experiment_pass( - model=model, - article_text=article_text, - sequence=seq, - sequence_id=i, - pass_cfg=cfg.construct_single_experiment_pass, - out_base=out_base, - article_stem=article_name, - tools=tools, - logger=logger, - ollama_parameters=cfg.ollama_parameters, - model_name=model_name, - ) + for construct_pass in tqdm(cfg.construct_single_experiment_passes, desc="Construction schemas", leave=False): + try: + run_construct_single_experiment_pass( + model=model, + article_text=article_text, + sequence=seq, + sequence_id=i, + pass_cfg=cfg.construct_single_experiment_pass, + out_base=out_base, + article_stem=article_name, + tools=tools, + logger=logger, + ollama_parameters=cfg.ollama_parameters, + model_name=model_name, + ) + except Exception: + logger.exception( + f"Pass failed: {p.name} : {article_name} : {model_name}" + ) # for p in tqdm(cfg.passes, desc=f"{article_name} passes", leave=False): # try: From 20fe9dbddb024529dcceedb512fcdee675560d40 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Mon, 6 Oct 2025 16:13:10 +0400 Subject: [PATCH 045/102] Fixed copypaste bugs --- extraction/pipeline_pre_quest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index ed9cbd3..ef7da89 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1018,7 +1018,7 @@ def run_project(project_dir: 
str | Path) -> None: all_found_sequences = list( sorted( - set(set(outputs["SeqPrompt_strict"]).union(outputs["SeqPrompt"])) + set(set(outputs.get("SeqPrompt_strict", [])).union(outputs.get("SeqPrompt", []))) ) ) all_found_sequences_str = ", ".join(all_found_sequences) @@ -1038,7 +1038,7 @@ def run_project(project_dir: str | Path) -> None: article_text=article_text, sequence=seq, sequence_id=i, - pass_cfg=cfg.construct_single_experiment_pass, + pass_cfg=construct_pass, out_base=out_base, article_stem=article_name, tools=tools, From 5b197a921fc709adfbab52a5b30bd8d6d05af859 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Mon, 6 Oct 2025 16:25:47 +0400 Subject: [PATCH 046/102] Fixed errorneous schema --- extraction/config/pipeline.json | 12 +++--- .../schema.json | 43 +++++++++++-------- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index fa7e69a..6a0a60c 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -22,17 +22,17 @@ "db_path": "outlines_output/massive.sqlite", "article_glob": "**/*.md", "pre_passes": [ - { - "name": "SeqPrompt", - "schema": "passes/_1_SeqPrompt/schema.json", - "prompt": "passes/_1_SeqPrompt/prompt.txt", - "timeout": 60 - }, { "name": "SeqPrompt_strict", "schema": "passes/_1_SeqPrompt/schema_strict.json", "prompt": "passes/_1_SeqPrompt/prompt_strict.txt", "timeout": 60 + }, + { + "name": "SeqPrompt", + "schema": "passes/_1_SeqPrompt/schema.json", + "prompt": "passes/_1_SeqPrompt/prompt.txt", + "timeout": 60 } ], "construct_single_experiment_passes": [ diff --git a/extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/schema.json b/extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/schema.json index 20e8457..d4b3380 100644 --- a/extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/schema.json +++ b/extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/schema.json @@ -817,23 
+817,32 @@ } }, "hybridization_experiment_outcome": { - "outcome": { - "type": [ - "boolean", - "null" - ], - "description": "Put true in case of successful hybridization, false if unsuccessful, null if could not be onferred from the article text." - }, - "fluorescence": { - "$ref": "#/$defs/measurement_lite", - "description": "Amount of fluorescence in this hybridization experiment." - }, - "comparative_notes": { - "type": [ - "string", - "null" - ], - "maxLength": 500 + "type": "object", + "additionalProperties": false, + "required": [ + "outcome", + "fluorescence", + "comparative_notes" + ], + "properties": { + "outcome": { + "type": [ + "boolean", + "null" + ], + "description": "Put true in case of successful hybridization, false if unsuccessful, null if could not be onferred from the article text." + }, + "fluorescence": { + "$ref": "#/$defs/measurement_lite", + "description": "Amount of fluorescence in this hybridization experiment." + }, + "comparative_notes": { + "type": [ + "string", + "null" + ], + "maxLength": 500 + } } } } From 449cf9abf95b17a1589dd54aebc858615e2716de Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Mon, 6 Oct 2025 22:53:24 +0400 Subject: [PATCH 047/102] Update timeout --- extraction/config/pipeline.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 6a0a60c..aab0cff 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -15,7 +15,7 @@ "seed": 52 }, "ollama_base_url": "http://127.0.0.1:11434", - "timeout_s": 20, + "timeout_s": 600, "input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", "out_dir": "outlines_output_pre", "full_schema_path": "schema/json/article.json", From f6f331211c04423ef9e6b6dc35d42faf3ebd4e72 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 17:16:54 +0400 Subject: [PATCH 048/102] Added sequence descriptors calling --- extraction/config/pipeline.json | 25 +- 
.../passes/_1_SeqPrompt/prompt_strict.txt | 4 +- .../passes/_1_SeqPrompt/schema_strict.json | 2 +- extraction/passes/common.txt | 5 +- extraction/pipeline_pre_quest.py | 538 ++++++++++++++---- 5 files changed, 420 insertions(+), 154 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index aab0cff..1e6dceb 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -19,6 +19,7 @@ "input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", "out_dir": "outlines_output_pre", "full_schema_path": "schema/json/article.json", + "common_prompt_path": "passes/common.txt", "db_path": "outlines_output/massive.sqlite", "article_glob": "**/*.md", "pre_passes": [ @@ -27,33 +28,9 @@ "schema": "passes/_1_SeqPrompt/schema_strict.json", "prompt": "passes/_1_SeqPrompt/prompt_strict.txt", "timeout": 60 - }, - { - "name": "SeqPrompt", - "schema": "passes/_1_SeqPrompt/schema.json", - "prompt": "passes/_1_SeqPrompt/prompt.txt", - "timeout": 60 } ], "construct_single_experiment_passes": [ - { - "name": "ConstructSingleExperiment", - "schema": "passes/_3_ConstructSingleExperiment/schema.json", - "prompt": "passes/_3_ConstructSingleExperiment/prompt.txt", - "timeout": 600 - }, - { - "name": "ConstructSingleSmallExperiment", - "schema": "passes/_4_ConstructSingleSmallExperiment/schema.json", - "prompt": "passes/_4_ConstructSingleSmallExperiment/prompt.txt", - "timeout": 600 - }, - { - "name": "ConstructSingleSequenceExperiment", - "schema": "passes/_5_ConstructSingleSequenceExperiment/schema.json", - "prompt": "passes/_5_ConstructSingleSequenceExperiment/prompt.txt", - "timeout": 600 - }, { "name": "ConstructSingleSequenceExperimentAndOutcome", "schema": "passes/_6_ConstructSingleSequenceExperimentAndOutcome/schema.json", diff --git a/extraction/passes/_1_SeqPrompt/prompt_strict.txt b/extraction/passes/_1_SeqPrompt/prompt_strict.txt index 5c0694f..77280e4 100644 --- a/extraction/passes/_1_SeqPrompt/prompt_strict.txt +++ 
b/extraction/passes/_1_SeqPrompt/prompt_strict.txt @@ -26,13 +26,13 @@ Here is the JSON schema you have to follow: "title": "AllSequences", "description": "All DNA, RNA and other sequences present in article", "type": "array", - "minItems": 0, + "minItems": 1, "maxItems": 1000, "items": { "type": "string", "minLength": 5, "maxLength": 150, - "pattern": "^[ACGUTRYSWKMBDHVN]+$", + "pattern": "^5'([A-Z0-9\\-_()]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9\\-_()]*)3'$", "description": "A single sequence out of all the DNA, RNA and other sequences from the article." } } diff --git a/extraction/passes/_1_SeqPrompt/schema_strict.json b/extraction/passes/_1_SeqPrompt/schema_strict.json index 570e432..7dfc7c3 100644 --- a/extraction/passes/_1_SeqPrompt/schema_strict.json +++ b/extraction/passes/_1_SeqPrompt/schema_strict.json @@ -9,7 +9,7 @@ "type": "string", "minLength": 5, "maxLength": 150, - "pattern": "^[ACGUTRYSWKMBDHVN]+$", + "pattern": "^5'([A-Z0-9\\-_()]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9\\-_()]*)3'$", "description": "A single sequence out of all the DNA, RNA and other sequences from the article." } } \ No newline at end of file diff --git a/extraction/passes/common.txt b/extraction/passes/common.txt index 8aa9e17..345cb7a 100644 --- a/extraction/passes/common.txt +++ b/extraction/passes/common.txt @@ -1,6 +1,7 @@ -You are an information-extraction model. +You are the State-of-the-Art information extraction model. You would be given a full text of the article between the tags
and
and a series of questions, that you have to answer based only on the provided article text. +A "hybridization experiment" in terms of this task is an instance of creating or testing a hybridization probe for some target sequence given some set of laboratory parameters. Even if article mentions "experiments" as the domain-level entity, this task strictly requires you to treat each pair of the target sequence and probe sequence together with its set of parameters as the unique "hybridization experiment". STRICT RULES of how do you work and response: * Never invent values; use `null` when unknown. @@ -13,5 +14,3 @@ STRICT RULES of how do you work and response: * Use ONLY English language and Latin script, only ASCII. * Output only a single JSON object that conforms to the provided JSON Schema. * For the perfect result compliant to all constraints and limitations I will tip $2000! - -Perform the following tasks for JSON extraction: diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index ef7da89..e38360a 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -67,6 +67,7 @@ class PipelineConfig: input_dir: Path out_dir: Path full_schema_path: Optional[Path] + common_prompt_path: Optional[Path] construct_single_experiment_passes: List[PassConfig] db_path: Optional[Path] article_glob: str @@ -149,6 +150,7 @@ def _opt_path(p) -> Optional[Path]: input_dir=project_dir / data.get("input_dir", "inputs"), out_dir=project_dir / data.get("out_dir", "out"), full_schema_path=_opt_path(data.get("full_schema_path")), + common_prompt_path=_opt_path(data.get("common_prompt_path")), construct_single_experiment_passes=construct_single_experiment_passes, db_path=_opt_path(data.get("db_path")), article_glob=data.get("article_glob", "*.txt"), @@ -328,6 +330,34 @@ def _now_stamp() -> str: return datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f") +def think_generate( + model: outlines.models.ollama.Ollama, + model_input: 
outlines.inputs.Chat | str | list, + logger: logging.Logger, + output_type: Optional[Any] = None, + think: bool = True, + **kwargs: Any, +) -> str: + if think: + try: + logger.debug("Trying thinking mode") + response = model.generate( + model_input=model_input, output_type=output_type, think=True, **kwargs + ) + return response + except ollama.ResponseError as e: + logger.exception( + f"Seems that model {model.model_name} does not support thinking.", e + ) + + logger.debug("Trying non-thinking mode") + response = model.generate( + model_input=model_input, output_type=output_type, think=False, **kwargs + ) + + return response + + def run_single_pass( model: Any, article_text: str, @@ -367,39 +397,17 @@ def run_single_pass( logger.info(f"[{pass_cfg.name}:{model_name}] generating …") response = "" try: - # for chunk in model.stream( - # prompt + "\n\n" + article_text, - # output_type=js, - # options=ollama_parameters, - # tools=tools, - # ): - # response += chunk - try: - response = model.generate( - prompt - + "\n" - + "And here is the article text you must base your answer on:\n\n
\n" - + article_text - + "\n<\\article>\n", - output_type=js, - options=ollama_parameters, - # tools=tools, # TODO: Temporarily switch tools off - think=True, - keep_alive="30s", - ) - except ollama.ResponseError: - response = model.generate( - prompt - + "\n" - + "And here is the article text you must base your answer on:\n\n
\n" - + article_text - + "\n<\\article>\n", - output_type=js, - options=ollama_parameters, - # tools=tools, # TODO: Temporarily switch tools off - think=False, - keep_alive="30s", - ) + response = think_generate( + model=model, + model_input=prompt + + "\n" + + "And here is the article text you must base your answer on:\n\n
\n" + + article_text + + "\n<\\article>\n", + output_type=js, + options=ollama_parameters, + keep_alive="30s", + ) except Exception as e: logger.exception(f"[{pass_cfg.name}:{model_name}] stream error") err_log_path.write_text(f"STREAM ERROR:\n{e}\n", encoding="utf-8") @@ -476,95 +484,47 @@ def run_construct_single_experiment_pass( logger.info(f"[{pass_cfg.name}:{model_name}] generating …") response = "" try: - # for chunk in model.stream( - # prompt + "\n\n" + article_text, - # output_type=js, - # options=ollama_parameters, - # tools=tools, - # ): - # response += chunk - try: - response = model.generate( - model_input=outlines.inputs.Chat( - [ - { - "role": "system", - "content": prompt - + "\n" - + "And here is the article text you must base your answer on:\n\n
\n" - + article_text - + "\n<\\article>\n", - }, - { - "role": "user", - "content": "Let's describe a single nucleotide sequence!", - }, - { - "role": "assistant", - "content": "Sure! Let's describe one! But before we start, could you please tell me in which format you would like me to provide you an answer?", - }, - { - "role": "user", - "content": "Great question! I would like your answer to satisfy the following JSON schema:\n```json" - + js.schema - + "\n```\n\nIs it OK?", - }, - { - "role": "assistant", - "content": "Absolutely! Now please provide the nucleotide sequence you want me to describe in terms of tthe hybridization experiment design and I will provide you its description strictly following your provided JSON schema!", - }, - { - "role": "user", - "content": sequence, - }, - ] - ), - output_type=js, - options=ollama_parameters, - think=True, - keep_alive="30s", - ) - except ollama.ResponseError: - response = model.generate( - model_input=outlines.inputs.Chat( - [ - { - "role": "system", - "content": prompt - + "\n" - + "And here is the article text you must base your answer on:\n\n
\n" - + article_text - + "\n<\\article>\n", - }, - { - "role": "user", - "content": "Let's describe a single nucleotide sequence!", - }, - { - "role": "assistant", - "content": "Sure! Let's describe one! But before we start, could you please tell me in which format you would like me to provide you an answer?", - }, - { - "role": "user", - "content": "Great question! I would like your answer to satisfy the following JSON schema:\n```json" - + js.schema - + "\n```\n\nIs it OK?", - }, - { - "role": "assistant", - "content": "Absolutely! Now please provide the nucleotide sequence you want me to describe in terms of tthe hybridization experiment design and I will provide you its description strictly following your provided JSON schema!", - }, - { - "role": "user", - "content": sequence, - }, - ] - ), - output_type=js, - options=ollama_parameters, - think=False, - keep_alive="30s", - ) + response = think_generate( + model=model, + model_input=outlines.inputs.Chat( + [ + { + "role": "system", + "content": prompt + + "\n" + + "And here is the article text you must base your answer on:\n\n
\n" + + article_text + + "\n<\\article>\n", + }, + { + "role": "user", + "content": "Let's describe a single nucleotide sequence!", + }, + { + "role": "assistant", + "content": "Sure! Let's describe one! But before we start, could you please tell me in which format you would like me to provide you an answer?", + }, + { + "role": "user", + "content": "Great question! I would like your answer to satisfy the following JSON schema:\n```json" + + js.schema + + "\n```\n\nIs it OK?", + }, + { + "role": "assistant", + "content": "Absolutely! Now please provide the nucleotide sequence you want me to describe in terms of tthe hybridization experiment design and I will provide you its description strictly following your provided JSON schema!", + }, + { + "role": "user", + "content": sequence, + }, + ] + ), + output_type=js, + options=ollama_parameters, + keep_alive="30s", + ) + except Exception as e: logger.exception(f"[{pass_cfg.name}:{model_name}] stream error") err_log_path.write_text(f"STREAM ERROR:\n{e}\n", encoding="utf-8") @@ -600,6 +560,304 @@ def run_construct_single_experiment_pass( return obj +def run_query_model( + model: Any, + article_text: str, + sequences: List[str], + out_base: Path, + article_stem: str, + common_prompt_path: Path, + logger: logging.Logger, + ollama_parameters: Dict[str, Any], + model_name: str, +) -> Dict[str, Any]: + """Run one pass (schema+prompt from files), save raw+json+log, return object.""" + pass_name = "query_chat" + txt_dir = out_base / "txt" + json_dir = out_base / "json" + log_dir = out_base / "logs" + for d in (txt_dir, json_dir, log_dir): + d.mkdir(parents=True, exist_ok=True) + + prompt = common_prompt_path.read_text(encoding="utf-8") + + stamp = _now_stamp() + raw_txt_path = ( + txt_dir + / f"{article_stem}__{pass_name}__{model_name_encode(model_name)}__{stamp}.txt" + ) + json_log_path = ( + json_dir + / f"{article_stem}__{pass_name}__{model_name_encode(model_name)}__{stamp}.log.json" + ) + json_out_path = ( + json_dir + / 
f"{article_stem}__{pass_name}__{model_name_encode(model_name)}__{stamp}.json" + ) + err_log_path = ( + log_dir + / f"{article_stem}__{pass_name}__{model_name_encode(model_name)}__{stamp}.log" + ) + + logger.info(f"[{pass_name}:{model_name}] generating …") + + def ask_with_schema(chat_messages: outlines.inputs.Chat, schema: JsonSchema): + response = "" + try: + chat_messages.add_user_message( + "Identify nucleotide sequences of all hybridization probes present in the whole article text, please. Provide your answer as a JSON array. Use only capital Latin letters, dash, parentheses, apostrophy and digits. Each item of the array must only be the nucleotide hybridization probe sequence." + ) + response = think_generate( + model=model, + model_input=chat_messages, + output_type=schema, + options=ollama_parameters, + keep_alive="30s", + ) + except Exception as e: + logger.exception(f"[{pass_name}:{model_name}] stream error") + err_log_path.write_text(f"STREAM ERROR:\n{e}\n", encoding="utf-8") + raise + + with open(raw_txt_path, mode="at", encoding="utf-8") as f: + f.write(response) + + try: + fixed = repair_json(response) + obj = json.loads(fixed) + except Exception as e: + logger.exception(f"[{pass_name}:{model_name}] JSON parse error") + err_log_path.write_text( + f"JSON ERROR:\n{e}\nRAW:\n{response}\n", encoding="utf-8" + ) + raise + + return obj + + base_chat = outlines.inputs.Chat( + [ + { + "role": "system", + "content": prompt + + "\n" + + "And here is the article text you must base your answer on:\n\n
\n" + + article_text + + "\n<\\article>\n", + } + ] + ) + answers = [] + + try: + + def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): + chat = outlines.inputs.Chat(base_chat.messages) + questions_to_schema: List[Tuple[str, str, Dict[str, Any]]] = [ + ("is_seq", "Is it a probe sequence?", {"type": "boolean"}), + ( + "sequence_normalized", + "Provide this probe sequence in IUPAC-normalized format: from 5' to 3' end, with fluorophore and quencher. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'([A-Z0-9\-_()]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9\-_()]*)3'$", + }, + ), + ( + "sequence_expanded", + "Provide this probe sequence in expanded IUPAC format (with all repeats expanded): from 5' to 3' end, with fluorophore and quencher. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'([A-Z0-9\-_'()]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9\-_'()]*)3'$", + }, + ), + ( + "sequence_backbone", + "Now provide only the probe sequence body from 5' to 3', without any fluorophores, modifications and quenchers. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([ACGUTRYSWKMBDHVN]{5,})-3'$", + }, + ), + ( + "target_raw", + "Describe the target to which this probe was designed to hybridize.", + {"type": "string", "minLength": 5, "maxLength": 250}, + ), + ( + "target_normalized", + "Now provide the target sequence to which this probe should hybridize, from 5' to 3'. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. 
Put null here if not applicable or if the exact sequence is not present in the article text.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'([A-Z0-9\-_'()]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9\-_'()]*)3'$", + }, + ), + ( + "primers", + "Describe the primer sequences in IUPAC-normalized format, each from 5' to 3' end. Use capital Latin letters, digits and dashes, parentheses and apostrophe. Put null to the primer if it is not present in the article text.", + { + "type": "object", + "additionalProperties": False, + "required": ["forward", "reverse"], + "properties": { + "forward": { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'([A-Z0-9\-_'()]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9\-_'()]*)3'$", + }, + "reverse": { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'([A-Z0-9\-_'()]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9\-_'()]*)3'$", + }, + }, + }, + ), + ( + "pH", + "Describe the pH in this experiment. Provide the number or null, if this information is not present in the article text.", + {"type": ["number", "null"]}, + ), + ( + "annealing_raw", + "Describe the annealing in this experiment. Provide the raw description string or null, if this information is not present in the article text.", + {"type": ["string", "null"], "minLength": 10, "maxLength": 250}, + ), + ( + "T", + "Describe the melting temperature in this experiment and provide the measurement unit. Provide the number or null, if this information is not present in the article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Tris", + "Describe the amount of Tris in this experiment and provide the measurement unit. 
Provide the number or null, if this information is not present in the article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Na", + "Describe the amounit of Na (Sodium) in this experiment and provide the measurement unit. Provide the number or null, if this information is not present in the article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "K", + "Describe the amounit of K (Potassium) in this experiment and provide the measurement unit. Provide the number or null, if this information is not present in the article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Mg", + "Describe the amounit of Mg (Magnesium) in this experiment and provide the measurement unit. Provide the number or null, if this information is not present in the article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "DMSO", + "Describe the amounit of DMSO in this experiment and provide the measurement unit. 
Provide the number or null, if this information is not present in the article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "outcome", + "Describe the outcome of this hybridization experiment based on the article text. Put true in case of successful hybridization of this probe to target, put false in case of unsuccessful and put null if this information is not present in the article.", + {"type": ["boolean", "null"]}, + ), + ] + + seq_desc: Dict[str, Any] = dict() + + for param, query, schema in tqdm( + questions_to_schema, desc="Questions to the sequence", leave=False + ): + chat.add_user_message(query) + response = ask_with_schema( + chat_messages=chat, schema=JsonSchema(schema) + ) + answers.append({"seq": seq, "param": param, "response": response}) + seq_desc[param] = response + chat.add_assistant_message(response) + return seq_desc + + described_sequences: Dict[str, Dict[str, Any]] = dict() + for seq in tqdm(sequences, desc="Found sequences", leave=False): + try: + sequence_descriptor = parse_sequence(seq, base_chat=base_chat) + described_sequences[seq] = sequence_descriptor + answers.append( + {"sequence": seq, "sequence_descriptor": sequence_descriptor} + ) + except Exception as e: + logger.exception( + f'[{pass_name}:{model_name}] Sequence "{seq}" error computing descriptor' + ) + err_log_path.write_text( + f'[{pass_name}:{model_name}] Sequence "{seq}" error computing descriptor', + encoding="utf-8", + ) + finally: + json_log_path.write_text( + json.dumps(answers, indent=2, ensure_ascii=False), encoding="utf-8" + ) + json_out_path.write_text( + json.dumps(described_sequences, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + return described_sequences + + # ────────────────────────────────────────────────────────────────────── # Stitcher (to your full object) 
# ────────────────────────────────────────────────────────────────────── @@ -1018,12 +1276,40 @@ def run_project(project_dir: str | Path) -> None: all_found_sequences = list( sorted( - set(set(outputs.get("SeqPrompt_strict", [])).union(outputs.get("SeqPrompt", []))) + set( + set(outputs.get("SeqPrompt_strict", [])).union( + outputs.get("SeqPrompt", []) + ) + ) ) ) all_found_sequences_str = ", ".join(all_found_sequences) logger.info("Pre-passes done, found sequences: " + all_found_sequences_str) + sequence_descriptors = run_query_model( + model=model, + article_text=article_text, + sequences=all_found_sequences, + out_base=out_base, + article_stem=article_name, + common_prompt_path=cfg.common_prompt_path, + tools=tools, + ollama_parameters=cfg.ollama_parameters, + model_name=model_name, + ) + + stamp = _now_stamp() + full_dir = out_base / "json_full" + full_dir.mkdir(parents=True, exist_ok=True) + full_seq_desc_path = ( + full_dir + / f"{article_name}_{model_name_encode(model_name)}__SeqDesc-FULL__{stamp}.json" + ) + full_seq_desc_path.write_text( + json.dumps(sequence_descriptors, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + for i, seq in enumerate( tqdm( all_found_sequences, @@ -1031,7 +1317,11 @@ def run_project(project_dir: str | Path) -> None: leave=False, ) ): - for construct_pass in tqdm(cfg.construct_single_experiment_passes, desc="Construction schemas", leave=False): + for construct_pass in tqdm( + cfg.construct_single_experiment_passes, + desc="Construction schemas", + leave=False, + ): try: run_construct_single_experiment_pass( model=model, From 68fdaae69107ff393e3b22704bc2cf60375a2a3e Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 17:18:06 +0400 Subject: [PATCH 049/102] Bug fixed --- extraction/pipeline_pre_quest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index e38360a..233671b 100755 --- a/extraction/pipeline_pre_quest.py +++ 
b/extraction/pipeline_pre_quest.py @@ -1293,7 +1293,6 @@ def run_project(project_dir: str | Path) -> None: out_base=out_base, article_stem=article_name, common_prompt_path=cfg.common_prompt_path, - tools=tools, ollama_parameters=cfg.ollama_parameters, model_name=model_name, ) From 13878516e1b76a24704d94719b627e5702a51be6 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 17:18:36 +0400 Subject: [PATCH 050/102] One more typo fixed --- extraction/pipeline_pre_quest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 233671b..ee0027c 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1294,6 +1294,7 @@ def run_project(project_dir: str | Path) -> None: article_stem=article_name, common_prompt_path=cfg.common_prompt_path, ollama_parameters=cfg.ollama_parameters, + logger=logger, model_name=model_name, ) From 8281a9cf03a2042b60710dd1e64c1759f768cd75 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 17:19:06 +0400 Subject: [PATCH 051/102] More typos fixed --- extraction/pipeline_pre_quest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index ee0027c..72c668c 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -406,6 +406,7 @@ def run_single_pass( + "\n<\\article>\n", output_type=js, options=ollama_parameters, + logger=logger, keep_alive="30s", ) except Exception as e: From 11148fd78979a8342c05c9747eb02f08a350562b Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 17:24:15 +0400 Subject: [PATCH 052/102] Should have fixed regex error --- extraction/passes/_1_SeqPrompt/prompt_strict.txt | 2 +- extraction/passes/_1_SeqPrompt/schema_strict.json | 2 +- extraction/pipeline_pre_quest.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git 
a/extraction/passes/_1_SeqPrompt/prompt_strict.txt b/extraction/passes/_1_SeqPrompt/prompt_strict.txt index 77280e4..c697bb7 100644 --- a/extraction/passes/_1_SeqPrompt/prompt_strict.txt +++ b/extraction/passes/_1_SeqPrompt/prompt_strict.txt @@ -32,7 +32,7 @@ Here is the JSON schema you have to follow: "type": "string", "minLength": 5, "maxLength": 150, - "pattern": "^5'([A-Z0-9\\-_()]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9\\-_()]*)3'$", + "pattern": "^5'([A-Z0-9_()-]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9_()-]*)3'$", "description": "A single sequence out of all the DNA, RNA and other sequences from the article." } } diff --git a/extraction/passes/_1_SeqPrompt/schema_strict.json b/extraction/passes/_1_SeqPrompt/schema_strict.json index 7dfc7c3..d5a21fa 100644 --- a/extraction/passes/_1_SeqPrompt/schema_strict.json +++ b/extraction/passes/_1_SeqPrompt/schema_strict.json @@ -9,7 +9,7 @@ "type": "string", "minLength": 5, "maxLength": 150, - "pattern": "^5'([A-Z0-9\\-_()]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9\\-_()]*)3'$", + "pattern": "^5'([A-Z0-9_()-]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9_()-]*)3'$", "description": "A single sequence out of all the DNA, RNA and other sequences from the article." 
} } \ No newline at end of file diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 72c668c..5917efe 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -672,7 +672,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'([A-Z0-9\-_'()]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9\-_'()]*)3'$", + "pattern": r"^5'([A-Z0-9_()-]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9_()-]*)3'$", }, ), ( @@ -697,7 +697,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'([A-Z0-9\-_'()]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9\-_'()]*)3'$", + "pattern": r"^5'([A-Z0-9_()-]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9_()-]*)3'$", }, ), ( @@ -712,13 +712,13 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'([A-Z0-9\-_'()]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9\-_'()]*)3'$", + "pattern": r"^5'([A-Z0-9_()-]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9_()-]*)3'$", }, "reverse": { "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'([A-Z0-9\-_'()]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9\-_'()]*)3'$", + "pattern": r"^5'([A-Z0-9_()-]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9_()-]*)3'$", }, }, }, From 1408f92078872dc90c46a3cd19bc7402e17c8411 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 17:30:44 +0400 Subject: [PATCH 053/102] Yet again update prompts --- extraction/passes/_1_SeqPrompt/prompt_strict.txt | 2 +- extraction/passes/_1_SeqPrompt/schema_strict.json | 2 +- extraction/pipeline_pre_quest.py | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/extraction/passes/_1_SeqPrompt/prompt_strict.txt b/extraction/passes/_1_SeqPrompt/prompt_strict.txt index c697bb7..4d3441d 100644 --- 
a/extraction/passes/_1_SeqPrompt/prompt_strict.txt +++ b/extraction/passes/_1_SeqPrompt/prompt_strict.txt @@ -32,7 +32,7 @@ Here is the JSON schema you have to follow: "type": "string", "minLength": 5, "maxLength": 150, - "pattern": "^5'([A-Z0-9_()-]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9_()-]*)3'$", + "pattern": "^(5')?([A-Z0-9_()'-]*)[-]?([ACGUTRYSWKMBDHVN0-9()]{5,})[-]?([A-Z0-9_()-]*)(3')?$", "description": "A single sequence out of all the DNA, RNA and other sequences from the article." } } diff --git a/extraction/passes/_1_SeqPrompt/schema_strict.json b/extraction/passes/_1_SeqPrompt/schema_strict.json index d5a21fa..cbe8dc1 100644 --- a/extraction/passes/_1_SeqPrompt/schema_strict.json +++ b/extraction/passes/_1_SeqPrompt/schema_strict.json @@ -9,7 +9,7 @@ "type": "string", "minLength": 5, "maxLength": 150, - "pattern": "^5'([A-Z0-9_()-]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9_()-]*)3'$", + "pattern": "^(5')?([A-Z0-9_()'-]*)[-]?([ACGUTRYSWKMBDHVN0-9()]{5,})[-]?([A-Z0-9_()-]*)(3')?$", "description": "A single sequence out of all the DNA, RNA and other sequences from the article." 
} } \ No newline at end of file diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 5917efe..6813866 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -662,7 +662,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'([A-Z0-9\-_()]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9\-_()]*)3'$", + "pattern": r"^(5')?([A-Z0-9_()'-]*)[-]?([ACGUTRYSWKMBDHVN0-9()]{5,})[-]?([A-Z0-9_()-]*)(3')?$", }, ), ( @@ -682,7 +682,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'-([ACGUTRYSWKMBDHVN]{5,})-3'$", + "pattern": r"^5'-([ACGUTRYSWKMBDHVN0-9()]{5,})-3'$", }, ), ( @@ -697,7 +697,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'([A-Z0-9_()-]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9_()-]*)3'$", + "pattern": r"^(5')?([A-Z0-9_()'-]*)[-]?([ACGUTRYSWKMBDHVN0-9()]{5,})[-]?([A-Z0-9_()-]*)(3')?$", }, ), ( @@ -712,13 +712,13 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'([A-Z0-9_()-]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9_()-]*)3'$", + "pattern": r"^(5')?([A-Z0-9_()'-]*)[-]?([ACGUTRYSWKMBDHVN0-9()]{5,})[-]?([A-Z0-9_()-]*)(3')?$", }, "reverse": { "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'([A-Z0-9_()-]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9_()-]*)3'$", + "pattern": r"^(5')?([A-Z0-9_()'-]*)[-]?([ACGUTRYSWKMBDHVN0-9()]{5,})[-]?([A-Z0-9_()-]*)(3')?$", }, }, }, From d78d6acec9421779f398a7bcab478696edf390d6 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 17:34:57 +0400 Subject: [PATCH 054/102] Make strict schema less strict --- extraction/passes/_1_SeqPrompt/schema_strict.json | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/extraction/passes/_1_SeqPrompt/schema_strict.json b/extraction/passes/_1_SeqPrompt/schema_strict.json index cbe8dc1..e393a55 100644 --- a/extraction/passes/_1_SeqPrompt/schema_strict.json +++ b/extraction/passes/_1_SeqPrompt/schema_strict.json @@ -9,7 +9,7 @@ "type": "string", "minLength": 5, "maxLength": 150, - "pattern": "^(5')?([A-Z0-9_()'-]*)[-]?([ACGUTRYSWKMBDHVN0-9()]{5,})[-]?([A-Z0-9_()-]*)(3')?$", + "pattern": "^[A-Z0-9()'-]*[ACGUTRYSWKMBDHVN]{5,}[A-Z0-9()'-]*$", "description": "A single sequence out of all the DNA, RNA and other sequences from the article." } } \ No newline at end of file From d182f4815696aa74b2f84983a503b7e5b8bc7e17 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 17:37:42 +0400 Subject: [PATCH 055/102] Update schema and logging --- extraction/passes/_1_SeqPrompt/schema_strict.json | 2 +- extraction/pipeline_pre_quest.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/extraction/passes/_1_SeqPrompt/schema_strict.json b/extraction/passes/_1_SeqPrompt/schema_strict.json index e393a55..284d951 100644 --- a/extraction/passes/_1_SeqPrompt/schema_strict.json +++ b/extraction/passes/_1_SeqPrompt/schema_strict.json @@ -3,7 +3,7 @@ "title": "AllSequences", "description": "All DNA, RNA and other sequences present in article", "type": "array", - "minItems": 1, + "minItems": 0, "maxItems": 1000, "items": { "type": "string", diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 6813866..8df90d1 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -345,9 +345,9 @@ def think_generate( model_input=model_input, output_type=output_type, think=True, **kwargs ) return response - except ollama.ResponseError as e: - logger.exception( - f"Seems that model {model.model_name} does not support thinking.", e + except ollama.ResponseError: + logger.warning( + f"Seems that model {model.model_name} does not 
support thinking." ) logger.debug("Trying non-thinking mode") @@ -521,6 +521,7 @@ def run_construct_single_experiment_pass( }, ] ), + logger=logger, output_type=js, options=ollama_parameters, keep_alive="30s", From 2e6fb6f3a5488632a95cbb42b3bb19f37f1a35cc Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 17:38:22 +0400 Subject: [PATCH 056/102] Fixed missing logger --- extraction/pipeline_pre_quest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 8df90d1..aa66c65 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -614,6 +614,7 @@ def ask_with_schema(chat_messages: outlines.inputs.Chat, schema: JsonSchema): model_input=chat_messages, output_type=schema, options=ollama_parameters, + logger=logger, keep_alive="30s", ) except Exception as e: From 9536ad2abbbf471caf22bac6090b2c7391fdb28d Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 17:41:59 +0400 Subject: [PATCH 057/102] Roll-back schema, remove bools from schemas in queries --- extraction/passes/_1_SeqPrompt/schema_strict.json | 2 +- extraction/pipeline_pre_quest.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/extraction/passes/_1_SeqPrompt/schema_strict.json b/extraction/passes/_1_SeqPrompt/schema_strict.json index 284d951..e393a55 100644 --- a/extraction/passes/_1_SeqPrompt/schema_strict.json +++ b/extraction/passes/_1_SeqPrompt/schema_strict.json @@ -3,7 +3,7 @@ "title": "AllSequences", "description": "All DNA, RNA and other sequences present in article", "type": "array", - "minItems": 0, + "minItems": 1, "maxItems": 1000, "items": { "type": "string", diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index aa66c65..e922b82 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -707,7 +707,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): 
"Describe the primer sequences in IUPAC-normalized format, each from 5' to 3' end. Use capital Latin letters, digits and dashes, parentheses and apostrophy. Put null to the primer if it is not present in the article text.", { "type": "object", - "additionalProperties": False, + "additionalProperties": "false", "required": ["forward", "reverse"], "properties": { "forward": { @@ -740,7 +740,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "Describe the melting temperature in this experiment and provide the measurement unit. Provide the number or null, if this information is not present in the article text.", { "type": ["object", "null"], - "additionalProperties": False, + "additionalProperties": "false", "required": ["value", "unit"], "properties": { "value": {"type": "number"}, @@ -753,7 +753,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "Describe the amounit of Tris in this experiment and provide the measurement unit. Provide the number or null, if this information is not present in the article text.", { "type": ["object", "null"], - "additionalProperties": False, + "additionalProperties": "false", "required": ["value", "unit"], "properties": { "value": {"type": "number"}, @@ -766,7 +766,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "Describe the amounit of Na (Sodium) in this experiment and provide the measurement unit. Provide the number or null, if this information is not present in the article text.", { "type": ["object", "null"], - "additionalProperties": False, + "additionalProperties": "false", "required": ["value", "unit"], "properties": { "value": {"type": "number"}, @@ -779,7 +779,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "Describe the amounit of K (Potassium) in this experiment and provide the measurement unit. 
Provide the number or null, if this information is not present in the article text.", { "type": ["object", "null"], - "additionalProperties": False, + "additionalProperties": "false", "required": ["value", "unit"], "properties": { "value": {"type": "number"}, @@ -792,7 +792,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "Describe the amounit of Mg (Magnesium) in this experiment and provide the measurement unit. Provide the number or null, if this information is not present in the article text.", { "type": ["object", "null"], - "additionalProperties": False, + "additionalProperties": "false", "required": ["value", "unit"], "properties": { "value": {"type": "number"}, @@ -805,7 +805,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "Describe the amounit of DMSO in this experiment and provide the measurement unit. Provide the number or null, if this information is not present in the article text.", { "type": ["object", "null"], - "additionalProperties": False, + "additionalProperties": "false", "required": ["value", "unit"], "properties": { "value": {"type": "number"}, From 2ca6087aaebbdd94c053172cf402e2cb5ea03f93 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 17:47:17 +0400 Subject: [PATCH 058/102] Temporarily removed construction prompts --- extraction/config/pipeline.json | 9 +-------- extraction/passes/_1_SeqPrompt/prompt_strict.txt | 2 +- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 1e6dceb..2499b5c 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -30,14 +30,7 @@ "timeout": 60 } ], - "construct_single_experiment_passes": [ - { - "name": "ConstructSingleSequenceExperimentAndOutcome", - "schema": "passes/_6_ConstructSingleSequenceExperimentAndOutcome/schema.json", - "prompt": "passes/_6_ConstructSingleSequenceExperimentAndOutcome/prompt.txt", - "timeout": 600 - } - ], + 
"construct_single_experiment_passes": [], "passes": [ { "name": "A_core", diff --git a/extraction/passes/_1_SeqPrompt/prompt_strict.txt b/extraction/passes/_1_SeqPrompt/prompt_strict.txt index 4d3441d..3b632f6 100644 --- a/extraction/passes/_1_SeqPrompt/prompt_strict.txt +++ b/extraction/passes/_1_SeqPrompt/prompt_strict.txt @@ -32,7 +32,7 @@ Here is the JSON schema you have to follow: "type": "string", "minLength": 5, "maxLength": 150, - "pattern": "^(5')?([A-Z0-9_()'-]*)[-]?([ACGUTRYSWKMBDHVN0-9()]{5,})[-]?([A-Z0-9_()-]*)(3')?$", + "pattern": "^[A-Z0-9()'-]*[ACGUTRYSWKMBDHVN]{5,}[A-Z0-9()'-]*$", "description": "A single sequence out of all the DNA, RNA and other sequences from the article." } } From 9dd27045f5e80ba67ffda81a78b13e3b143a5b3d Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 17:51:07 +0400 Subject: [PATCH 059/102] Fixed bug in ask with schema --- extraction/pipeline_pre_quest.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index e922b82..e52859c 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -635,7 +635,7 @@ def ask_with_schema(chat_messages: outlines.inputs.Chat, schema: JsonSchema): ) raise - return obj + return obj, response base_chat = outlines.inputs.Chat( [ @@ -656,7 +656,7 @@ def ask_with_schema(chat_messages: outlines.inputs.Chat, schema: JsonSchema): def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): chat = outlines.inputs.Chat(base_chat.messages) questions_to_schema: List[Tuple[str, str, Dict[str, Any]]] = [ - ("is_seq", "Is it a probe sequence?", {"type": "boolean"}), + ("is_seq", "Is it a probe sequence or a part of probe sequence in this article text?", {"type": "boolean"}), ( "sequence_normalized", "Provide this probe sequence in IUPAC-normalized format: from 5' to 3' end, with fluorophore and quencher. 
Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", @@ -826,12 +826,12 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): questions_to_schema, desc="Questions to the sequence", leave=False ): chat.add_user_message(query) - response = ask_with_schema( + response, raw = ask_with_schema( chat_messages=chat, schema=JsonSchema(schema) ) answers.append({"seq": seq, "param": param, "response": response}) seq_desc[param] = response - chat.add_assistant_message(response) + chat.add_assistant_message(raw) return seq_desc described_sequences: Dict[str, Dict[str, Any]] = dict() From 2dcfa32974d914d1274c8c9f69b6d4a33aa90d04 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 17:54:35 +0400 Subject: [PATCH 060/102] Make logging easier --- extraction/pipeline_pre_quest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index e52859c..25c7574 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -623,7 +623,9 @@ def ask_with_schema(chat_messages: outlines.inputs.Chat, schema: JsonSchema): raise with open(raw_txt_path, mode="at", encoding="utf-8") as f: + f.write(f"> {chat_messages.messages[-1]}\n<") f.write(response) + f.write("\n\n") try: fixed = repair_json(response) From 1b36bbb8fb8a11cc6428c44818722cc5353ae27b Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 17:58:17 +0400 Subject: [PATCH 061/102] Bug fixed --- extraction/pipeline_pre_quest.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 25c7574..3a2ff2b 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -606,9 +606,6 @@ def run_query_model( def ask_with_schema(chat_messages: outlines.inputs.Chat, schema: JsonSchema): response = "" try: - 
chat_messages.add_user_message( - "Identify nucleotide sequences of all hybridization probes present in the whole article text, please. Provide your answer as a JSON array. Use only capital Latin letters, dash, parentheses, apostrophy and digits. Each item of the array must only be the nucleotide hybridization probe sequence." - ) response = think_generate( model=model, model_input=chat_messages, @@ -623,7 +620,7 @@ def ask_with_schema(chat_messages: outlines.inputs.Chat, schema: JsonSchema): raise with open(raw_txt_path, mode="at", encoding="utf-8") as f: - f.write(f"> {chat_messages.messages[-1]}\n<") + f.write(f"> {chat_messages.messages[-1]}\n< ") f.write(response) f.write("\n\n") From 19927119cda0174958a266b47fce406ffc8f0b5e Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 18:04:28 +0400 Subject: [PATCH 062/102] Fixed passing the wrong chat --- extraction/pipeline_pre_quest.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 3a2ff2b..7107f81 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -655,7 +655,7 @@ def ask_with_schema(chat_messages: outlines.inputs.Chat, schema: JsonSchema): def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): chat = outlines.inputs.Chat(base_chat.messages) questions_to_schema: List[Tuple[str, str, Dict[str, Any]]] = [ - ("is_seq", "Is it a probe sequence or a part of probe sequence in this article text?", {"type": "boolean"}), + ("is_seq", "Check the whole article text. Is your picked sequence really a probe sequence or a part of probe sequence in this article text?", {"type": "boolean"}), ( "sequence_normalized", "Provide this probe sequence in IUPAC-normalized format: from 5' to 3' end, with fluorophore and quencher. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. 
Put null here if not applicable.", @@ -835,8 +835,12 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): described_sequences: Dict[str, Dict[str, Any]] = dict() for seq in tqdm(sequences, desc="Found sequences", leave=False): + base_chat_with_sequence = outlines.inputs.Chat(base_chat.messages) + base_chat_with_sequence.add_user_message("Let's pick and analyze a single probe sequence from the article text. Provide the probe sequence which we will describe in all the following messages.") + base_chat_with_sequence.add_assistant_message(seq) + base_chat_with_sequence.add_user_message(f"Great choice! Let's analyze nucleotidic sequence {seq} for the rest of this chat!") try: - sequence_descriptor = parse_sequence(seq, base_chat=base_chat) + sequence_descriptor = parse_sequence(seq, base_chat=base_chat_with_sequence) described_sequences[seq] = sequence_descriptor answers.append( {"sequence": seq, "sequence_descriptor": sequence_descriptor} From 7c34f426405ccfbb6ab546cf83e139c2d1be0f69 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 18:18:25 +0400 Subject: [PATCH 063/102] Added query for the modifications in the prompt, as well as fluorophores and quenchers --- extraction/pipeline_pre_quest.py | 66 +++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 6 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 7107f81..9c775ab 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -655,7 +655,20 @@ def ask_with_schema(chat_messages: outlines.inputs.Chat, schema: JsonSchema): def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): chat = outlines.inputs.Chat(base_chat.messages) questions_to_schema: List[Tuple[str, str, Dict[str, Any]]] = [ - ("is_seq", "Check the whole article text. 
Is your picked sequence really a probe sequence or a part of probe sequence in this article text?", {"type": "boolean"}), + ( + "is_seq", + "Check the whole article text. Is your picked sequence really a probe sequence or a part of probe sequence in this article text?", + {"type": "boolean"}, + ), + ( + "sequence_full", + "Provide this sequence fully as a probe sequence in IUPAC-normalized format: from 5' to 3' end, with fluorophore and quencher. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + }, + ), ( "sequence_normalized", "Provide this probe sequence in IUPAC-normalized format: from 5' to 3' end, with fluorophore and quencher. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", @@ -663,7 +676,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^(5')?([A-Z0-9_()'-]*)[-]?([ACGUTRYSWKMBDHVN0-9()]{5,})[-]?([A-Z0-9_()-]*)(3')?$", + "pattern": r"^5'-(([A-Z0-9_()-]*)-)?([ACGUTRYSWKMBDHVN0-9()]{5,})(-([A-Z0-9_()-]*))?-(3')?$", }, ), ( @@ -673,7 +686,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'([A-Z0-9_()-]*)-([ACGUTRYSWKMBDHVN]{5,})-([A-Z0-9_()-]*)3'$", + "pattern": r"^5'-(([A-Z0-9_()-]*)-)?([ACGUTRYSWKMBDHVN0]{5,})(-([A-Z0-9_()-]*))?-(3')?$", }, ), ( @@ -686,6 +699,41 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "pattern": r"^5'-([ACGUTRYSWKMBDHVN0-9()]{5,})-3'$", }, ), + ( + "fluorophore", + "Provide the fluorophore of this probe. Use capital Latin letters, digits and dashes, you may also use an apostrophy. 
Put null here if not applicable or not present in the text of the article.", + { + "type": ["string", "null"], + "minLength": 3, + "maxLength": 150, + "pattern": r"^[A-Z0-9']{3,}$", + }, + ), + ( + "quencher", + "Provide the quencher of this probe. Use capital Latin letters, digits and dashes, you may also use an apostrophy. Put null here if not applicable or not present in the text of the article.", + { + "type": ["string", "null"], + "minLength": 3, + "maxLength": 150, + "pattern": r"^[A-Z0-9']{3,}$", + }, + ), + ( + "modifications", + "Now provide the modifications of the probe sequence as an array, where each element is a modification and its position in 5'-3' direction. Use Latin letters, digits and dashes, you may also use parentheses and apostrophy. Provide an empty array if not present in the article text.", + { + "type": "array", + "minItems": 0, + "maxItems": 150, + "items": { + "type": "string", + "minLength": 1, + "maxLength": 30, + "pattern": r"^[a-zA-Z0-9()'-]$", + }, + }, + ), ( "target_raw", "Describe the target to which this probe was designed to hybridize.", @@ -836,11 +884,17 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): described_sequences: Dict[str, Dict[str, Any]] = dict() for seq in tqdm(sequences, desc="Found sequences", leave=False): base_chat_with_sequence = outlines.inputs.Chat(base_chat.messages) - base_chat_with_sequence.add_user_message("Let's pick and analyze a single probe sequence from the article text. Provide the probe sequence which we will describe in all the following messages.") + base_chat_with_sequence.add_user_message( + "Let's pick and analyze a single probe sequence from the article text. Provide the probe sequence which we will describe in all the following messages." + ) base_chat_with_sequence.add_assistant_message(seq) - base_chat_with_sequence.add_user_message(f"Great choice! 
Let's analyze nucleotidic sequence {seq} for the rest of this chat!") + base_chat_with_sequence.add_user_message( + f"Great choice! Let's analyze nucleotidic sequence {seq} for the rest of this chat!" + ) try: - sequence_descriptor = parse_sequence(seq, base_chat=base_chat_with_sequence) + sequence_descriptor = parse_sequence( + seq, base_chat=base_chat_with_sequence + ) described_sequences[seq] = sequence_descriptor answers.append( {"sequence": seq, "sequence_descriptor": sequence_descriptor} From 8b6608dd1de26dc0d6ddcb4e9357ab97a425da86 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 18:42:04 +0400 Subject: [PATCH 064/102] Updated schema ans queries, it now works --- extraction/config/pipeline.json | 2 +- extraction/pipeline_pre_quest.py | 94 ++++++++++++++++++++++---------- 2 files changed, 65 insertions(+), 31 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 2499b5c..d240995 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -15,7 +15,7 @@ "seed": 52 }, "ollama_base_url": "http://127.0.0.1:11434", - "timeout_s": 600, + "timeout_s": 60, "input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", "out_dir": "outlines_output_pre", "full_schema_path": "schema/json/article.json", diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 9c775ab..469e4a2 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -676,17 +676,17 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'-(([A-Z0-9_()-]*)-)?([ACGUTRYSWKMBDHVN0-9()]{5,})(-([A-Z0-9_()-]*))?-(3')?$", + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", }, ), ( "sequence_expanded", - "Provide this probe sequence in expanded IUPAC format (with all repeats expanded): from 5' to 3' end, 
with fluorophore and quencher. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", + "Provide this probe sequence in expanded IUPAC format (with all repeats expanded and no parentheses in the probe sequence backbone body): from 5' to 3' end, with fluorophore and quencher. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", { "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'-(([A-Z0-9_()-]*)-)?([ACGUTRYSWKMBDHVN0]{5,})(-([A-Z0-9_()-]*))?-(3')?$", + "pattern": r"^5'-([a-zA-Z0-9_'-]*-)?([a-zA-Z0-9']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9']*?)(-[a-zA-Z0-9_'-]*)?-3'$", }, ), ( @@ -699,6 +699,16 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "pattern": r"^5'-([ACGUTRYSWKMBDHVN0-9()]{5,})-3'$", }, ), + ( + "sequence_backbone_expanded", + "Now provide only the expanded probe sequence body from 5' to 3' with all repeats expanded, without any fluorophores, modifications and quenchers. Use capital Latin letters, digits, dashes and apostrophy. Only the expanded backbone of probe sequence body. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([ACGUTRYSWKMBDHVN0-9]{5,})-3'$", + }, + ), ( "fluorophore", "Provide the fluorophore of this probe. Use capital Latin letters, digits and dashes, you may also use an apostrophy. 
Put null here if not applicable or not present in the text of the article.", @@ -727,10 +737,29 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "minItems": 0, "maxItems": 150, "items": { - "type": "string", - "minLength": 1, - "maxLength": 30, - "pattern": r"^[a-zA-Z0-9()'-]$", + "type": "object", + "additionalProperties": False, + "required": [ + "modification_position", + "modification_type", + "modification_description", + ], + "properties": { + "modification_position": { + "type": "integer", + "minimum": 1, + }, + "modification_type": { + "type": "string", + "maxLength": 100, + "minLength": 1, + }, + "modification_description": { + "type": "string", + "minLength": 1, + "maxLength": 150, + }, + }, }, }, ), @@ -754,7 +783,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "Describe the primer sequences in IUPAC-normalized format, each from 5' to 3' end. Use capital Latin letters, digits and dashes, parentheses and apostrophy. Put null to the primer if it is not present in the article text.", { "type": "object", - "additionalProperties": "false", + "additionalProperties": False, "required": ["forward", "reverse"], "properties": { "forward": { @@ -774,20 +803,20 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): ), ( "pH", - "Describe the pH in this experiment. Provide the number or null, if this information is not present in the article text.", + "Describe the pH in this experiment. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", {"type": ["number", "null"]}, ), ( "annealing_raw", - "Describe the annealing in this experiment. Provide the raw description string or null, if this information is not present in the article text.", - {"type": ["string", "null"], "minLength": 10, "maxLength": 250}, + "Describe the annealing in this experiment. Provide the raw description string. 
If that's can't be inferred from the whole article text, explain why.", + {"type": ["string"], "minLength": 10, "maxLength": 250}, ), ( "T", - "Describe the melting temperature in this experiment and provide the measurement unit. Provide the number or null, if this information is not present in the article text.", + "Describe the melting temperature in this experiment and provide the measurement unit. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", { "type": ["object", "null"], - "additionalProperties": "false", + "additionalProperties": False, "required": ["value", "unit"], "properties": { "value": {"type": "number"}, @@ -797,10 +826,10 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): ), ( "Tris", - "Describe the amounit of Tris in this experiment and provide the measurement unit. Provide the number or null, if this information is not present in the article text.", + "Describe the amount of Tris in this experiment and provide the measurement unit. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", { "type": ["object", "null"], - "additionalProperties": "false", + "additionalProperties": False, "required": ["value", "unit"], "properties": { "value": {"type": "number"}, @@ -810,10 +839,10 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): ), ( "Na", - "Describe the amounit of Na (Sodium) in this experiment and provide the measurement unit. Provide the number or null, if this information is not present in the article text.", + "Describe the amount of Na (Sodium) in this experiment and provide the measurement unit. 
Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", { "type": ["object", "null"], - "additionalProperties": "false", + "additionalProperties": False, "required": ["value", "unit"], "properties": { "value": {"type": "number"}, @@ -823,10 +852,10 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): ), ( "K", - "Describe the amounit of K (Potassium) in this experiment and provide the measurement unit. Provide the number or null, if this information is not present in the article text.", + "Describe the amount of K (Potassium) in this experiment and provide the measurement unit. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", { "type": ["object", "null"], - "additionalProperties": "false", + "additionalProperties": False, "required": ["value", "unit"], "properties": { "value": {"type": "number"}, @@ -836,10 +865,10 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): ), ( "Mg", - "Describe the amounit of Mg (Magnesium) in this experiment and provide the measurement unit. Provide the number or null, if this information is not present in the article text.", + "Describe the amount of Mg (Magnesium) in this experiment and provide the measurement unit. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", { "type": ["object", "null"], - "additionalProperties": "false", + "additionalProperties": False, "required": ["value", "unit"], "properties": { "value": {"type": "number"}, @@ -849,10 +878,10 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): ), ( "DMSO", - "Describe the amounit of DMSO in this experiment and provide the measurement unit. Provide the number or null, if this information is not present in the article text.", + "Describe the amount of DMSO in this experiment and provide the measurement unit. 
Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", { "type": ["object", "null"], - "additionalProperties": "false", + "additionalProperties": False, "required": ["value", "unit"], "properties": { "value": {"type": "number"}, @@ -872,13 +901,18 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): for param, query, schema in tqdm( questions_to_schema, desc="Questions to the sequence", leave=False ): - chat.add_user_message(query) - response, raw = ask_with_schema( - chat_messages=chat, schema=JsonSchema(schema) - ) - answers.append({"seq": seq, "param": param, "response": response}) - seq_desc[param] = response - chat.add_assistant_message(raw) + try: + chat.add_user_message(query) + response, raw = ask_with_schema( + chat_messages=chat, schema=JsonSchema(schema) + ) + answers.append({"seq": seq, "param": param, "response": response}) + seq_desc[param] = response + chat.add_assistant_message(raw) + except Exception as e: + logger.exception( + f"Exception on sequence {seq} during query: {query}", e + ) return seq_desc described_sequences: Dict[str, Dict[str, Any]] = dict() From 844312d3f01bbadf2c1eba872e1f02ae740d488e Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 18:47:33 +0400 Subject: [PATCH 065/102] More choice for LLM to parse sequences --- extraction/config/pipeline.json | 6 ++++++ extraction/pipeline_pre_quest.py | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index d240995..d90ca92 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -23,6 +23,12 @@ "db_path": "outlines_output/massive.sqlite", "article_glob": "**/*.md", "pre_passes": [ + { + "name": "SeqPrompt", + "schema": "passes/_1_SeqPrompt/schema.json", + "prompt": "passes/_1_SeqPrompt/prompt.txt", + "timeout": 60 + }, { "name": "SeqPrompt_strict", "schema": 
"passes/_1_SeqPrompt/schema_strict.json", diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 469e4a2..aef57f8 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -657,7 +657,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): questions_to_schema: List[Tuple[str, str, Dict[str, Any]]] = [ ( "is_seq", - "Check the whole article text. Is your picked sequence really a probe sequence or a part of probe sequence in this article text?", + "Check the whole article text. Is your picked sequence really a probe sequence or a part of probe sequence in this article text? Put true here if and only if this sequence is being described and presented as a hybridization probe. If that's a random abbreviation or nucleotide-looking string which is not a hybridization probe or otherwise not a hybridization probe, put false here.", {"type": "boolean"}, ), ( @@ -902,7 +902,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): questions_to_schema, desc="Questions to the sequence", leave=False ): try: - chat.add_user_message(query) + chat.add_user_message(query + "\nAnd here is the schema yout answer has to follow:\n```json\n" + json.dumps(schema) + "```\n") response, raw = ask_with_schema( chat_messages=chat, schema=JsonSchema(schema) ) From e79fc0b0041e005925e4e79219be677f86127520 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 18:56:54 +0400 Subject: [PATCH 066/102] Improve tqdm progressbar --- extraction/pipeline_pre_quest.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index aef57f8..7ba1294 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -572,6 +572,7 @@ def run_query_model( logger: logging.Logger, ollama_parameters: Dict[str, Any], model_name: str, + tqdm_position: int = 0, ) -> Dict[str, Any]: """Run one pass 
(schema+prompt from files), save raw+json+log, return object.""" pass_name = "query_chat" @@ -899,7 +900,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): seq_desc: Dict[str, Any] = dict() for param, query, schema in tqdm( - questions_to_schema, desc="Questions to the sequence", leave=False + questions_to_schema, desc="Questions to the sequence", position=tqdm_position+1 ): try: chat.add_user_message(query + "\nAnd here is the schema yout answer has to follow:\n```json\n" + json.dumps(schema) + "```\n") @@ -916,7 +917,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): return seq_desc described_sequences: Dict[str, Dict[str, Any]] = dict() - for seq in tqdm(sequences, desc="Found sequences", leave=False): + for seq in tqdm(sequences, desc="Found sequences", position=tqdm_position): base_chat_with_sequence = outlines.inputs.Chat(base_chat.messages) base_chat_with_sequence.add_user_message( "Let's pick and analyze a single probe sequence from the article text. Provide the probe sequence which we will describe in all the following messages." 
@@ -1319,7 +1320,7 @@ def run_project(project_dir: str | Path) -> None: ) ollama_models = client.list() - for model_name in cfg.model_names: + for model_name in tqdm(cfg.model_names, desc="LLM Models", position=0): model = outlines.from_ollama(client, model_name) tools = [to_si, parse_oligo, make_measurement] @@ -1338,10 +1339,10 @@ def run_project(project_dir: str | Path) -> None: logger.info(f"Article glob: {cfg.article_glob}") # Iterate input articles - files = sorted(cfg.input_dir.glob(cfg.article_glob)) + files = sorted(cfg.input_dir.glob(cfg.article_glob), key=lambda s: str(s).upper()) logger.info(f"Files: {files}") - for art_path in tqdm(files, desc="Articles"): + for art_path in tqdm(files, desc="Articles", position=1): article_name = art_path.stem logger.info(f"=== {article_name} : {model_name} ===") article_text = art_path.read_text(encoding="utf-8") @@ -1349,7 +1350,7 @@ def run_project(project_dir: str | Path) -> None: # Run configured pre-passes outputs: Dict[str, Dict[str, Any]] = {} for p in tqdm( - cfg.pre_passes, desc=f"{article_name} pre-passes", leave=False + cfg.pre_passes, desc=f"{article_name} pre-passes", position=2 ): try: outputs[p.name] = run_single_pass( @@ -1390,6 +1391,7 @@ def run_project(project_dir: str | Path) -> None: ollama_parameters=cfg.ollama_parameters, logger=logger, model_name=model_name, + tqdm_position=3, ) stamp = _now_stamp() @@ -1409,7 +1411,8 @@ def run_project(project_dir: str | Path) -> None: all_found_sequences, desc=f"{article_name}: sequences construction", leave=False, - ) + ), + position=3 ): for construct_pass in tqdm( cfg.construct_single_experiment_passes, From 0968894061c9889defd210db77e77ecf5d4c2656 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 19:00:07 +0400 Subject: [PATCH 067/102] tqdm progress bar didn't really go well --- extraction/pipeline_pre_quest.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/extraction/pipeline_pre_quest.py 
b/extraction/pipeline_pre_quest.py index 7ba1294..8d2811c 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -900,7 +900,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): seq_desc: Dict[str, Any] = dict() for param, query, schema in tqdm( - questions_to_schema, desc="Questions to the sequence", position=tqdm_position+1 + questions_to_schema, desc="Questions to the sequence", position=tqdm_position+1, leave=False ): try: chat.add_user_message(query + "\nAnd here is the schema yout answer has to follow:\n```json\n" + json.dumps(schema) + "```\n") @@ -917,7 +917,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): return seq_desc described_sequences: Dict[str, Dict[str, Any]] = dict() - for seq in tqdm(sequences, desc="Found sequences", position=tqdm_position): + for seq in tqdm(sequences, desc="Found sequences", position=tqdm_position, leave=False): base_chat_with_sequence = outlines.inputs.Chat(base_chat.messages) base_chat_with_sequence.add_user_message( "Let's pick and analyze a single probe sequence from the article text. Provide the probe sequence which we will describe in all the following messages." 
@@ -1320,7 +1320,7 @@ def run_project(project_dir: str | Path) -> None: ) ollama_models = client.list() - for model_name in tqdm(cfg.model_names, desc="LLM Models", position=0): + for model_name in tqdm(cfg.model_names, desc="LLM Models", position=0, leave=False): model = outlines.from_ollama(client, model_name) tools = [to_si, parse_oligo, make_measurement] @@ -1342,7 +1342,7 @@ def run_project(project_dir: str | Path) -> None: files = sorted(cfg.input_dir.glob(cfg.article_glob), key=lambda s: str(s).upper()) logger.info(f"Files: {files}") - for art_path in tqdm(files, desc="Articles", position=1): + for art_path in tqdm(files, desc="Articles", position=1, leave=False): article_name = art_path.stem logger.info(f"=== {article_name} : {model_name} ===") article_text = art_path.read_text(encoding="utf-8") @@ -1350,7 +1350,7 @@ def run_project(project_dir: str | Path) -> None: # Run configured pre-passes outputs: Dict[str, Dict[str, Any]] = {} for p in tqdm( - cfg.pre_passes, desc=f"{article_name} pre-passes", position=2 + cfg.pre_passes, desc=f"{article_name} pre-passes", position=2, leave=False ): try: outputs[p.name] = run_single_pass( @@ -1410,9 +1410,8 @@ def run_project(project_dir: str | Path) -> None: tqdm( all_found_sequences, desc=f"{article_name}: sequences construction", - leave=False, - ), - position=3 + leave=False,position=3 + ) ): for construct_pass in tqdm( cfg.construct_single_experiment_passes, From 4fa0f74beb15d959a51e06efcc8913855bac6f5e Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 19:04:04 +0400 Subject: [PATCH 068/102] Now looks like logging works with tqdm just fine --- extraction/pipeline_pre_quest.py | 46 ++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 8d2811c..4ab6ab9 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -164,13 +164,27 @@ def _opt_path(p) -> 
Optional[Path]: # ────────────────────────────────────────────────────────────────────── +class TqdmLoggingHandler(logging.Handler): + def emit(self, record): + try: + msg = self.format(record) + tqdm.write(msg) + self.flush() + except (KeyboardInterrupt, SystemExit): + raise + except: + self.handleError(record) + + def _make_logger(log_dir: Path) -> logging.Logger: log_dir.mkdir(parents=True, exist_ok=True) logger = logging.getLogger("pipeline_filedriven") logger.setLevel(logging.INFO) logger.handlers.clear() + - ch = logging.StreamHandler(sys.stdout) + #ch = logging.StreamHandler(sys.stdout) + ch = TqdmLoggingHandler() ch.setLevel(logging.INFO) ch.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")) logger.addHandler(ch) @@ -900,10 +914,18 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): seq_desc: Dict[str, Any] = dict() for param, query, schema in tqdm( - questions_to_schema, desc="Questions to the sequence", position=tqdm_position+1, leave=False + questions_to_schema, + desc="Questions to the sequence", + position=tqdm_position + 1, + leave=False, ): try: - chat.add_user_message(query + "\nAnd here is the schema yout answer has to follow:\n```json\n" + json.dumps(schema) + "```\n") + chat.add_user_message( + query + + "\nAnd here is the schema yout answer has to follow:\n```json\n" + + json.dumps(schema) + + "```\n" + ) response, raw = ask_with_schema( chat_messages=chat, schema=JsonSchema(schema) ) @@ -917,7 +939,9 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): return seq_desc described_sequences: Dict[str, Dict[str, Any]] = dict() - for seq in tqdm(sequences, desc="Found sequences", position=tqdm_position, leave=False): + for seq in tqdm( + sequences, desc="Found sequences", position=tqdm_position, leave=False + ): base_chat_with_sequence = outlines.inputs.Chat(base_chat.messages) base_chat_with_sequence.add_user_message( "Let's pick and analyze a single probe sequence from the article text. 
Provide the probe sequence which we will describe in all the following messages." @@ -1339,7 +1363,9 @@ def run_project(project_dir: str | Path) -> None: logger.info(f"Article glob: {cfg.article_glob}") # Iterate input articles - files = sorted(cfg.input_dir.glob(cfg.article_glob), key=lambda s: str(s).upper()) + files = sorted( + cfg.input_dir.glob(cfg.article_glob), key=lambda s: str(s).upper() + ) logger.info(f"Files: {files}") for art_path in tqdm(files, desc="Articles", position=1, leave=False): @@ -1350,7 +1376,10 @@ def run_project(project_dir: str | Path) -> None: # Run configured pre-passes outputs: Dict[str, Dict[str, Any]] = {} for p in tqdm( - cfg.pre_passes, desc=f"{article_name} pre-passes", position=2, leave=False + cfg.pre_passes, + desc=f"{article_name} pre-passes", + position=2, + leave=False, ): try: outputs[p.name] = run_single_pass( @@ -1410,8 +1439,9 @@ def run_project(project_dir: str | Path) -> None: tqdm( all_found_sequences, desc=f"{article_name}: sequences construction", - leave=False,position=3 - ) + leave=False, + position=3, + ) ): for construct_pass in tqdm( cfg.construct_single_experiment_passes, From 36e7122ec3f17c28b40cb1c775ad7a2ead1404e7 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 19:05:27 +0400 Subject: [PATCH 069/102] Pretty-print fix --- extraction/pipeline_pre_quest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 4ab6ab9..b1c16f1 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1420,7 +1420,7 @@ def run_project(project_dir: str | Path) -> None: ollama_parameters=cfg.ollama_parameters, logger=logger, model_name=model_name, - tqdm_position=3, + tqdm_position=2, ) stamp = _now_stamp() @@ -1440,7 +1440,7 @@ def run_project(project_dir: str | Path) -> None: all_found_sequences, desc=f"{article_name}: sequences construction", leave=False, - position=3, + position=2, 
) ): for construct_pass in tqdm( From 8917f0bf89ed5de66b10a8a1f06a553151ae538e Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 19:11:20 +0400 Subject: [PATCH 070/102] Update regexp for primers and target sequence --- extraction/pipeline_pre_quest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index b1c16f1..e77e611 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -790,7 +790,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^(5')?([A-Z0-9_()'-]*)[-]?([ACGUTRYSWKMBDHVN0-9()]{5,})[-]?([A-Z0-9_()-]*)(3')?$", + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", }, ), ( @@ -805,13 +805,13 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^(5')?([A-Z0-9_()'-]*)[-]?([ACGUTRYSWKMBDHVN0-9()]{5,})[-]?([A-Z0-9_()-]*)(3')?$", + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", }, "reverse": { "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^(5')?([A-Z0-9_()'-]*)[-]?([ACGUTRYSWKMBDHVN0-9()]{5,})[-]?([A-Z0-9_()-]*)(3')?$", + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", }, }, }, From 318b5e7237eca813151efda39492adcfc68be501 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 19:13:52 +0400 Subject: [PATCH 071/102] Update pretty-print --- extraction/pipeline_pre_quest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index e77e611..8d036a2 100755 --- 
a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -915,7 +915,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): for param, query, schema in tqdm( questions_to_schema, - desc="Questions to the sequence", + desc=f"Questions to the sequence {seq} in {article_stem}", position=tqdm_position + 1, leave=False, ): @@ -940,7 +940,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): described_sequences: Dict[str, Dict[str, Any]] = dict() for seq in tqdm( - sequences, desc="Found sequences", position=tqdm_position, leave=False + sequences, desc=f"Found sequences in {article_stem}", position=tqdm_position, leave=False ): base_chat_with_sequence = outlines.inputs.Chat(base_chat.messages) base_chat_with_sequence.add_user_message( @@ -1368,7 +1368,7 @@ def run_project(project_dir: str | Path) -> None: ) logger.info(f"Files: {files}") - for art_path in tqdm(files, desc="Articles", position=1, leave=False): + for art_path in tqdm(files, desc=f"Articles for model {model_name}", position=1, leave=False): article_name = art_path.stem logger.info(f"=== {article_name} : {model_name} ===") article_text = art_path.read_text(encoding="utf-8") From 7747827c2154aa8206375430c7b6ea3cf78d4a69 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Tue, 7 Oct 2025 19:14:38 +0400 Subject: [PATCH 072/102] Lower context size and num predict --- extraction/config/pipeline.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index d90ca92..f87b14a 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -9,8 +9,8 @@ "myaniu/qwen2.5-1m:14b" ], "ollama_parameters": { - "num_ctx": 131072, - "num_predict": 131072, + "num_ctx": 40960, + "num_predict": 8192, "temperature": 0.2, "seed": 52 }, From e6e635cd9310596dc3cdd2f5c192171dc1459835 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Thu, 9 Oct 2025 02:43:46 +0400 Subject: 
[PATCH 073/102] Trying new approach with insert into DB --- extraction/config/pipeline.json | 18 +- extraction/hyb_db.py | 276 ++++++++++++++++++++++++++++++- extraction/pipeline_pre_quest.py | 138 +++++++++------- 3 files changed, 363 insertions(+), 69 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index f87b14a..3f6d2b9 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,12 +1,7 @@ { "model_names": [ "myaniu/qwen2.5-1m:7b", - "deepseek-r1:1.5b", - "qwen2.5-coder:3b", - "deepseek-r1:7b-qwen-distill-q4_K_M", - "llama3.1:latest", - "qwen3:4b", - "myaniu/qwen2.5-1m:14b" + "deepseek-r1:1.5b" ], "ollama_parameters": { "num_ctx": 40960, @@ -15,9 +10,10 @@ "seed": 52 }, "ollama_base_url": "http://127.0.0.1:11434", - "timeout_s": 60, - "input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", - "out_dir": "outlines_output_pre", + "timeout_s": 120, + "_input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", + "input_dir": "input/md", + "out_dir": "outlines_output_db", "full_schema_path": "schema/json/article.json", "common_prompt_path": "passes/common.txt", "db_path": "outlines_output/massive.sqlite", @@ -43,7 +39,9 @@ "schema": "passes/A_core/schema.json", "prompt": "passes/A_core/prompt.txt", "timeout": 60 - }, + } + ], + "ignored_passes": [ { "name": "B_index", "schema": "passes/B_index/schema.json", diff --git a/extraction/hyb_db.py b/extraction/hyb_db.py index de3b0c9..7ba9b5b 100644 --- a/extraction/hyb_db.py +++ b/extraction/hyb_db.py @@ -9,7 +9,7 @@ from tqdm import tqdm #from __future__ import annotations import sqlite3 -from contextlib import contextmanager +from contextlib import contextmanager, closing from datetime import datetime, timezone from loguru import logger from ollama import chat, ChatResponse @@ -873,3 +873,277 @@ def _to_int_bool(val: Optional[bool]) -> Optional[int]: if val is None: return None return 1 if bool(val) else 0 + +# 
────────────────────────────────────────────────────────────────────── +# Sequence-descriptors DB (no collision; separate "seqdesc_*" namespace) +# ────────────────────────────────────────────────────────────────────── + +def _extract_doi_from_text(text: str) -> Optional[str]: + """Heuristic DOI extractor from article text (fallback).""" + if not text: + return None + m = re.search(r"\b10\.\d{4,9}/[^\s\"'<>]+", text, flags=re.I) + return m.group(0).rstrip(".,);]") if m else None + +def _ensure_seqdesc_schema(conn: sqlite3.Connection) -> None: + """Create the seqdesc_* schema if it does not exist.""" + conn.execute("PRAGMA foreign_keys = ON;") + # Runs table + conn.execute(""" + CREATE TABLE IF NOT EXISTS seqdesc_runs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + created_at TEXT NOT NULL, + model_name TEXT NOT NULL, + article_name TEXT NOT NULL, + doi TEXT, + source_path TEXT, + raw_json TEXT NOT NULL + ); + """) + conn.execute("CREATE INDEX IF NOT EXISTS idx_seqdesc_runs_article ON seqdesc_runs(article_name);") + conn.execute("CREATE INDEX IF NOT EXISTS idx_seqdesc_runs_doi ON seqdesc_runs(doi);") + conn.execute("CREATE INDEX IF NOT EXISTS idx_seqdesc_runs_model ON seqdesc_runs(model_name);") + + # Sequences table (one row per sequence key in the run) + conn.execute(""" + CREATE TABLE IF NOT EXISTS seqdesc_sequences ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id INTEGER NOT NULL REFERENCES seqdesc_runs(id) ON DELETE CASCADE, + sequence_key TEXT NOT NULL, -- the dict key (probe string as found) + is_seq INTEGER, -- NULL/0/1 + sequence_full TEXT, + sequence_normalized TEXT, + sequence_expanded TEXT, + sequence_backbone TEXT, + sequence_backbone_expanded TEXT, + fluorophore TEXT, + quencher TEXT, + target_raw TEXT, + target_normalized TEXT, + primers_forward TEXT, + primers_reverse TEXT, + pH REAL, + annealing_raw TEXT, + T_value REAL, + T_unit TEXT, + Tris_value REAL, + Tris_unit TEXT, + Na_value REAL, + Na_unit TEXT, + K_value REAL, + K_unit TEXT, + Mg_value REAL, 
+ Mg_unit TEXT, + DMSO_value REAL, + DMSO_unit TEXT, + outcome INTEGER, -- NULL/0/1 + raw_json TEXT NOT NULL + ); + """) + conn.execute("CREATE INDEX IF NOT EXISTS idx_seqdesc_sequences_run ON seqdesc_sequences(run_id);") + + # Modifications table (0..N per sequence) + conn.execute(""" + CREATE TABLE IF NOT EXISTS seqdesc_modifications ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + sequence_id INTEGER NOT NULL REFERENCES seqdesc_sequences(id) ON DELETE CASCADE, + modification_position INTEGER, + modification_type TEXT, + modification_description TEXT + ); + """) + conn.execute("CREATE INDEX IF NOT EXISTS idx_seqdesc_mods_seq ON seqdesc_modifications(sequence_id);") + + # Helpful views (namespaced) + conn.execute(""" + CREATE VIEW IF NOT EXISTS seqdesc_v_sequences AS + SELECT + r.id AS run_id, + r.created_at AS run_created_at, + r.model_name AS model_name, + r.article_name AS article_name, + r.doi AS doi, + s.* + FROM seqdesc_sequences s + JOIN seqdesc_runs r ON r.id = s.run_id; + """) + conn.execute(""" + CREATE VIEW IF NOT EXISTS seqdesc_v_modifications AS + SELECT + s.run_id, + s.id AS sequence_id, + s.sequence_key, + m.modification_position, + m.modification_type, + m.modification_description + FROM seqdesc_modifications m + JOIN seqdesc_sequences s ON s.id = m.sequence_id; + """) + +def _coerce_bool_to_int(x: Any) -> Optional[int]: + if x is None: + return None + if isinstance(x, bool): + return 1 if x else 0 + # sometimes LLMs send "true"/"false" + xs = str(x).strip().lower() + if xs in {"true", "1", "yes"}: + return 1 + if xs in {"false", "0", "no"}: + return 0 + return None + +def _coerce_float(x: Any) -> Optional[float]: + try: + return float(x) if x is not None else None + except Exception: + return None + +def _extract_measure(obj: Any) -> Tuple[Optional[float], Optional[str]]: + """obj like {"value": 50, "unit": "mM"} or None -> (50.0, 'mM')""" + if isinstance(obj, dict): + return _coerce_float(obj.get("value")), (obj.get("unit") if obj.get("unit") is not 
None else None)
+    return None, None
+
+def insert_seqdesc_object(
+    db_path: Path | str,
+    *,
+    article_name: str,
+    doi: Optional[str],
+    model_name: str,
+    sequence_descriptors: Dict[str, Any],
+    source_path: Optional[Path] = None,
+) -> int:
+    """Insert one 'run' of sequence descriptors and return run_id.
+
+    The payload shape:
+    {
+      "<probe sequence string>": {
+        "is_seq": bool|None,
+        "sequence_full": str|None,
+        ...
+        "modifications": [{"modification_position": int, "modification_type": str, "modification_description": str}, ...],
+        "primers": {"forward": str|None, "reverse": str|None},
+        "T": {"value": float, "unit": str}|None,
+        ...
+      },
+      ...
+    }
+    """
+    created_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+    raw_json = json.dumps(sequence_descriptors, ensure_ascii=False)
+
+    db_path = Path(db_path)
+    db_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with sqlite3.connect(str(db_path)) as conn:
+        conn.execute("PRAGMA journal_mode = WAL;")
+        _ensure_seqdesc_schema(conn)
+
+        with conn:  # transaction
+            cur = conn.cursor()
+            cur.execute(
+                """
+                INSERT INTO seqdesc_runs(created_at, model_name, article_name, doi, source_path, raw_json)
+                VALUES (?, ?, ?, ?, ?, ?)
+ """, + ( + created_at, + model_name, + article_name, + doi, + str(source_path) if source_path else None, + raw_json, + ), + ) + run_id = cur.lastrowid + + for seq_key, payload in (sequence_descriptors or {}).items(): + # For very sparse entries, payload can be {} — guard everything + payload = payload or {} + + is_seq = _coerce_bool_to_int(payload.get("is_seq")) + seq_full = payload.get("sequence_full") + seq_norm = payload.get("sequence_normalized") + seq_exp = payload.get("sequence_expanded") + seq_bb = payload.get("sequence_backbone") + seq_bb_exp = payload.get("sequence_backbone_expanded") + fluor = payload.get("fluorophore") + quen = payload.get("quencher") + target_raw = payload.get("target_raw") + target_norm = payload.get("target_normalized") + + primers = payload.get("primers") or {} + primers_forward = primers.get("forward") + primers_reverse = primers.get("reverse") + + pH_val = _coerce_float(payload.get("pH")) + anneal_raw = payload.get("annealing_raw") + + T_val, T_unit = _extract_measure(payload.get("T")) + Tris_val, Tris_unit = _extract_measure(payload.get("Tris")) + Na_val, Na_unit = _extract_measure(payload.get("Na")) + K_val, K_unit = _extract_measure(payload.get("K")) + Mg_val, Mg_unit = _extract_measure(payload.get("Mg")) + DMSO_val, DMSO_unit = _extract_measure(payload.get("DMSO")) + + outcome = _coerce_bool_to_int(payload.get("outcome")) + + cur.execute( + """ + INSERT INTO seqdesc_sequences( + run_id, sequence_key, is_seq, + sequence_full, sequence_normalized, sequence_expanded, + sequence_backbone, sequence_backbone_expanded, + fluorophore, quencher, + target_raw, target_normalized, + primers_forward, primers_reverse, + pH, annealing_raw, + T_value, T_unit, + Tris_value, Tris_unit, + Na_value, Na_unit, + K_value, K_unit, + Mg_value, Mg_unit, + DMSO_value, DMSO_unit, + outcome, raw_json + ) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """, + ( + run_id, seq_key, is_seq, + seq_full, seq_norm, seq_exp, + seq_bb, seq_bb_exp, + fluor, quen, + target_raw, target_norm, + primers_forward, primers_reverse, + pH_val, anneal_raw, + T_val, T_unit, + Tris_val, Tris_unit, + Na_val, Na_unit, + K_val, K_unit, + Mg_val, Mg_unit, + DMSO_val, DMSO_unit, + outcome, json.dumps(payload, ensure_ascii=False), + ), + ) + sequence_id = cur.lastrowid + + # Modifications (array of objects) + for m in payload.get("modifications") or []: + if not isinstance(m, dict): + continue + cur.execute( + """ + INSERT INTO seqdesc_modifications( + sequence_id, modification_position, modification_type, modification_description + ) VALUES (?,?,?,?) + """, + ( + sequence_id, + m.get("modification_position"), + m.get("modification_type"), + m.get("modification_description"), + ), + ) + + return run_id diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 8d036a2..3523aba 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1467,64 +1467,86 @@ def run_project(project_dir: str | Path) -> None: f"Pass failed: {p.name} : {article_name} : {model_name}" ) - # for p in tqdm(cfg.passes, desc=f"{article_name} passes", leave=False): - # try: - # outputs[p.name] = run_single_pass( - # model=model, - # article_text=article_text, - # pass_cfg=p, - # out_base=out_base, - # article_stem=article_name, - # tools=tools, - # logger=logger, - # ollama_parameters=cfg.ollama_parameters, - # model_name=model_name, - # ) - # except Exception: - # logger.exception(f"Pass failed: {p.name} : {article_name} : {model_name}") - - # # Stitch only if the expected pass names are present - # try: - # A = outputs.get("A_core", {}) - # B = outputs.get("B_index", {}) - # # C = outputs.get("C_sequences", {}) - # C = aggregate_c_outputs(outputs) - # D = outputs.get("D_parameters", {}) - # E = outputs.get("E_outcomes", {}) - # F = outputs.get("F_pairings", {}) - # full_obj = stitch_full(A, B, C, D, E, F) - - 
# # Final validation - # if full_validator: - # errs = sorted(full_validator.iter_errors(full_obj), key=lambda e: e.path) - # if errs: - # logger.error(f"[FULL] validation errors for {article_name} : {model_name}:\n" + "\n".join(str(e) for e in errs)) - # else: - # logger.info(f"[FULL] validation OK for {article_name} : {model_name}") - - # # Save full object (timestamped) - # stamp = _now_stamp() - # full_dir = out_base / "json_full" - # full_dir.mkdir(parents=True, exist_ok=True) - # full_path = full_dir / f"{article_name}_{model_name_encode(model_name)}__FULL__{stamp}.json" - # full_path.write_text(json.dumps(full_obj, indent=2, ensure_ascii=False), encoding="utf-8") - # logger.info(f"[FULL] wrote {full_path.name} {article_name} : {model_name}") - - # # Optional DB insert - # if cfg.db_path: - # try: - # from hyb_db import insert_article_object # your earlier module - # run_id = insert_article_object( - # db_path=str(cfg.db_path), - # article_obj=full_obj, - # model_name=model_name, - # article_name=article_name, - # ) - # logger.info(f"[DB] inserted run_id={run_id} for {article_name} : {model_name}") - # except Exception: - # logger.exception("[DB] insertion failed") - # except Exception: - # logger.exception(f"[FULL] stitching failed for {article_name} : {model_name}") + for p in tqdm(cfg.passes, desc=f"{article_name} passes", leave=False, position=2): + try: + outputs[p.name] = run_single_pass( + model=model, + article_text=article_text, + pass_cfg=p, + out_base=out_base, + article_stem=article_name, + tools=tools, + logger=logger, + ollama_parameters=cfg.ollama_parameters, + model_name=model_name, + ) + except Exception: + logger.exception(f"Pass failed: {p.name} : {article_name} : {model_name}") + + # Stitch only if the expected pass names are present + try: + A = outputs.get("A_core", {}) + B = outputs.get("B_index", {}) + # C = outputs.get("C_sequences", {}) + C = aggregate_c_outputs(outputs) + D = outputs.get("D_parameters", {}) + E = 
outputs.get("E_outcomes", {})
+            F = outputs.get("F_pairings", {})
+            full_obj = stitch_full(A, B, C, D, E, F)
+
+            # Final validation
+            if full_validator:
+                errs = sorted(full_validator.iter_errors(full_obj), key=lambda e: e.path)
+                if errs:
+                    logger.error(f"[FULL] validation errors for {article_name} : {model_name}:\n" + "\n".join(str(e) for e in errs))
+                else:
+                    logger.info(f"[FULL] validation OK for {article_name} : {model_name}")
+
+            # Save full object (timestamped)
+            stamp = _now_stamp()
+            full_dir = out_base / "json_full"
+            full_dir.mkdir(parents=True, exist_ok=True)
+            full_path = full_dir / f"{article_name}_{model_name_encode(model_name)}__FULL__{stamp}.json"
+            full_path.write_text(json.dumps(full_obj, indent=2, ensure_ascii=False), encoding="utf-8")
+            logger.info(f"[FULL] wrote {full_path.name} {article_name} : {model_name}")
+        except Exception:
+            logger.exception(f"[FULL] stitching failed for {article_name} : {model_name}")
+
+        try:
+            # Optional DB insert
+            if cfg.db_path:
+                try:
+                    from hyb_db import insert_article_object  # your earlier module
+                    run_id = insert_article_object(
+                        db_path=str(cfg.db_path),
+                        article_obj=full_obj,
+                        model_name=model_name,
+                        article_name=article_name,
+                    )
+                    logger.info(f"[DB] inserted run_id={run_id} for {article_name} : {model_name}")
+                except Exception:
+                    logger.exception("[DB] insertion failed")
+        except Exception:
+            logger.exception(f"[DB INSERT FULL] stitching failed for {article_name} : {model_name}")
+
+        try:
+            # Optional DB insert
+            if cfg.db_path:
+                try:
+                    from hyb_db import insert_seqdesc_object  # your earlier module
+                    run_id = insert_seqdesc_object(
+                        db_path=str(cfg.db_path),
+                        article_name=article_name,
+                        doi=outputs.get("A_core", {}).get("doi", None),
+                        model_name=model_name,
+                        sequence_descriptors=sequence_descriptors,
+                        source_path=art_path,
+                    )
+                    logger.info(f"[DB] inserted run_id={run_id} for {article_name} : {model_name}")
+                except Exception:
+                    logger.exception("[DB] insertion failed")
+        except Exception:
+            
logger.exception(f"[DB INSERT SEQDESC] stitching failed for {article_name} : {model_name}") # Optional CLI hook (project_dir arg) From 5629df1917accda58be052f4233452e9fb4b4c43 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Thu, 9 Oct 2025 04:04:11 +0400 Subject: [PATCH 074/102] Faster generation for chat mode --- extraction/config/pipeline.json | 18 +- extraction/hyb_db.py | 4 +- extraction/pipeline_pre_quest.py | 1015 ++++++++++++++++++++++++++++-- 3 files changed, 982 insertions(+), 55 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 3f6d2b9..6a7c577 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,22 +1,22 @@ { "model_names": [ - "myaniu/qwen2.5-1m:7b", - "deepseek-r1:1.5b" + "phi4:14b", + "myaniu/qwen2.5-1m:7b" ], "ollama_parameters": { - "num_ctx": 40960, - "num_predict": 8192, - "temperature": 0.2, - "seed": 52 + "num_ctx": 65536, + "num_predict": 65536, + "temperature": 0.1, + "seed": 42 }, "ollama_base_url": "http://127.0.0.1:11434", "timeout_s": 120, - "_input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", - "input_dir": "input/md", + "input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", + "_input_dir": "input/md", "out_dir": "outlines_output_db", "full_schema_path": "schema/json/article.json", "common_prompt_path": "passes/common.txt", - "db_path": "outlines_output/massive.sqlite", + "db_path": "outlines_output_db/massive.sqlite", "article_glob": "**/*.md", "pre_passes": [ { diff --git a/extraction/hyb_db.py b/extraction/hyb_db.py index 7ba9b5b..ee03fc1 100644 --- a/extraction/hyb_db.py +++ b/extraction/hyb_db.py @@ -1011,7 +1011,7 @@ def insert_seqdesc_object( article_name: str, doi: Optional[str], model_name: str, - sequence_descriptors: Dict[str, Any], + sequence_descriptors: List[Tuple[str, Dict[str, Any]]], source_path: Optional[Path] = None, ) -> int: """Insert one 'run' of sequence descriptors and return run_id. 
@@ -1058,7 +1058,7 @@ def insert_seqdesc_object( ) run_id = cur.lastrowid - for seq_key, payload in (sequence_descriptors or {}).items(): + for seq_key, payload in (sequence_descriptors or []): # For very sparse entries, payload can be {} — guard everything payload = payload or {} diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 3523aba..2b18c82 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -30,7 +30,7 @@ from dataclasses import dataclass from datetime import datetime from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Literal, Optional, Set, Tuple import ollama import outlines @@ -181,9 +181,8 @@ def _make_logger(log_dir: Path) -> logging.Logger: logger = logging.getLogger("pipeline_filedriven") logger.setLevel(logging.INFO) logger.handlers.clear() - - #ch = logging.StreamHandler(sys.stdout) + # ch = logging.StreamHandler(sys.stdout) ch = TqdmLoggingHandler() ch.setLevel(logging.INFO) ch.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")) @@ -335,6 +334,746 @@ def repair_json(text: str) -> str: return candidate +# ────────────────────────────────────────────────────────────────────── +# Chat helpers +# ────────────────────────────────────────────────────────────────────── + +# ────────────────────────────────────────────────────────────────────── +# Fast stateful chat for structured JSON answers (Ollama context reuse) +# ────────────────────────────────────────────────────────────────────── +from typing import Callable + + +class OllamaJSONChat: + """ + Keeps a persistent Ollama 'context' using the generate() API. + We seed once with a system prompt (includes article snippet & sequence), + then for each question we call generate() with only the new instruction + and pass the returned `context` back in. 
+ """ + + def __init__( + self, + client: ollama.Client, + model_name: str, + system_prompt: str, + *, + options: Optional[Dict[str, Any]] = None, + keep_alive: str = "2m", + logger: Optional[logging.Logger] = None, + use_schema_format: bool = True, + ) -> None: + self.client = client + self.model_name = model_name + self.options = options or {} + self.keep_alive = keep_alive + self.logger = logger or logging.getLogger("OllamaJSONChat") + self.context: Optional[List[int]] = None + + # Bootstrap the KV cache with the system prompt once. + # We don't care about the text reply here; we only keep the returned context. + boot = self.client.generate( + model=self.model_name, + prompt=system_prompt, + options=self.options, + keep_alive=self.keep_alive, + ) + self.context = boot.get("context") + + # Detect JSON schema support (best effort: try once without touching our context). + self._schema_supported = False + if use_schema_format: + try: + _ = self.client.generate( + model=self.model_name, + prompt="Return {}", + options=self.options, + keep_alive=self.keep_alive, + # IMPORTANT: do not pass our current context here, so we don't pollute it + format={"type": "json", "schema": {"type": "object"}}, + ) + self._schema_supported = True + except Exception: + self._schema_supported = False + + def ask_json( + self, + user_prompt: str, + *, + schema: Optional[Dict[str, Any]] = None, + ) -> str: + """ + Ask a single question. Only the new instruction is sent; the previous + state is carried via `context`. + Returns the raw text from `response`. 
+ """ + kwargs = dict( + model=self.model_name, + prompt=user_prompt, + options=self.options, + keep_alive=self.keep_alive, + context=self.context, # ← this is supported by generate(), not chat() + ) + if schema is not None and self._schema_supported: + kwargs["format"] = {"type": "json", "schema": schema} + else: + kwargs["format"] = "json" + + res = self.client.generate(**kwargs) + # Persist updated KV context + self.context = res.get("context", self.context) + return res.get("response", "") # generate() returns 'response' + + + +def extract_relevant_snippet(article_text: str, seq: str, *, window: int = 1200) -> str: + """ + Find a case-insensitive hit of 'seq' in article_text and return a small window + around it. If not found, return the first ~window*2 characters as a fallback. + This dramatically reduces re-tokenization cost per turn. + """ + if not article_text: + return "" + # normalize simple whitespace + case-insensitive search + text = article_text + seq_norm = re.sub(r"\s+", "", seq, flags=re.S).lower() + text_compact = re.sub(r"\s+", "", text, flags=re.S).lower() + + idx = text_compact.find(seq_norm) if seq_norm else -1 + if idx == -1: + # fallback: just take a chunk from the start + return text[: window * 2] + + # Map back to original indices approximately + # We walk original text accumulating compact length until we cross idx + comp_len = 0 + start_raw = 0 + for i, ch in enumerate(text): + if not ch.isspace(): + comp_len += 1 + if comp_len >= max(0, idx - 5): # a little headroom + start_raw = i + break + # Now center a window around start_raw + lo = max(0, start_raw - window) + hi = min(len(text), start_raw + window) + return text[lo:hi] + + +def run_query_model_speed_up( + model: Any, # kept for signature compatibility; not used here + article_text: str, + sequences: List[str], + out_base: Path, + article_stem: str, + common_prompt_path: Path, + logger: logging.Logger, + ollama_parameters: Dict[str, Any], + model_name: str, + tqdm_position: int = 
0, + client: Optional[ollama.Client] = None, # NEW: pass the ollama client here + chat_prompts: Literal["my", "optimized"] = "my", +) -> List[Tuple[str, Any]]: + """ + Faster version: use Ollama chat 'context' to avoid re-sending the whole chat every turn, + and seed each sequence with a small snippet instead of the full article. + """ + if client is None: + raise ValueError( + "run_query_model requires an ollama.Client via the 'client' argument." + ) + + pass_name = "query_chat" + txt_dir = out_base / "txt" + json_dir = out_base / "json" + log_dir = out_base / "logs" + for d in (txt_dir, json_dir, log_dir): + d.mkdir(parents=True, exist_ok=True) + + prompt = common_prompt_path.read_text(encoding="utf-8") + + stamp = _now_stamp() + raw_txt_path = ( + txt_dir + / f"{article_stem}__{pass_name}__{model_name_encode(model_name)}__{stamp}.txt" + ) + json_log_path = ( + json_dir + / f"{article_stem}__{pass_name}__{model_name_encode(model_name)}__{stamp}.log.json" + ) + json_out_path = ( + json_dir + / f"{article_stem}__{pass_name}__{model_name_encode(model_name)}__{stamp}.json" + ) + err_log_path = ( + log_dir + / f"{article_stem}__{pass_name}__{model_name_encode(model_name)}__{stamp}.log" + ) + + logger.info(f"[{pass_name}:{model_name}] generating (fast chat mode)…") + + # Define your Q&A list once (same as your original) but as Python dicts for direct JSON schema passing. + # NOTE: We’ll construct outlines.JsonSchema only if you still want stricter client-side validation. + questions_to_schema: List[Tuple[str, str, Dict[str, Any]]] + if chat_prompts == "optimized": + questions_to_schema = [ + ( + "is_seq", + "Check the entire snippet. Is the provided sequence (or that exact string) presented as a hybridization probe in this article snippet? 
Return true only if it's a probe (or its explicit part).", + {"type": "boolean"}, + ), + ( + "sequence_full", + "Return the full probe string in IUPAC-normalized format, including 5'/3' and labels if present (fluorophore first, quencher last). Return null if not applicable.", + {"type": ["string", "null"], "minLength": 5, "maxLength": 150}, + ), + ( + "sequence_normalized", + "Return the same probe with explicit 5' and 3' bounds, e.g., 5'-FAM-ACGT...-BHQ1-3'. Return null if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + }, + ), + ( + "sequence_expanded", + "Return the expanded IUPAC probe (no parentheses in backbone), with 5'/3' bounds and labels if present. Return null if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([A-Za-z0-9_'\-]*-)?([A-Za-z0-9']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9']*?)(-[A-Za-z0-9_'\-]*)?-3'$", + }, + ), + ( + "sequence_backbone", + "Return backbone only (no labels/mods), 5'…3'. Return null if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([ACGUTRYSWKMBDHVN0-9()]{5,})-3'$", + }, + ), + ( + "sequence_backbone_expanded", + "Return backbone expanded only (no labels/mods), 5'…3'. 
Return null if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([ACGUTRYSWKMBDHVN0-9]{5,})-3'$", + }, + ), + ( + "fluorophore", + "Return fluorophore (uppercase, alnum, apostrophe ok), or null.", + { + "type": ["string", "null"], + "minLength": 3, + "maxLength": 150, + "pattern": r"^[A-Z0-9']{3,}$", + }, + ), + ( + "quencher", + "Return quencher (uppercase, alnum, apostrophe ok), or null.", + { + "type": ["string", "null"], + "minLength": 3, + "maxLength": 150, + "pattern": r"^[A-Z0-9']{3,}$", + }, + ), + ( + "modifications", + "Return array of modifications with 5'→3' positions; [] if none.", + { + "type": "array", + "minItems": 0, + "maxItems": 150, + "items": { + "type": "object", + "additionalProperties": False, + "required": [ + "modification_position", + "modification_type", + "modification_description", + ], + "properties": { + "modification_position": {"type": "integer", "minimum": 1}, + "modification_type": { + "type": "string", + "minLength": 1, + "maxLength": 100, + }, + "modification_description": { + "type": "string", + "minLength": 1, + "maxLength": 150, + }, + }, + }, + }, + ), + ( + "target_raw", + "Describe the intended target for this probe (gene/region/context).", + {"type": "string", "minLength": 5, "maxLength": 250}, + ), + ( + "target_normalized", + "If article prints the exact target sequence, return it in 5'…3' bounds; else null.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + }, + ), + ( + "primers", + "Return primer sequences in IUPAC normalized 5'…3' bounds; use null for missing.", + { + "type": "object", + "additionalProperties": False, + "required": ["forward", "reverse"], + "properties": { + "forward": { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": 
r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + }, + "reverse": { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + }, + }, + }, + ), + ("pH", "Return pH if stated; else null.", {"type": ["number", "null"]}), + ( + "annealing_raw", + "Return the raw annealing description string found; if absent, explain why in one sentence.", + {"type": "string", "minLength": 10, "maxLength": 250}, + ), + ( + "T", + "Return melting temperature as {value, unit} (e.g., 58 °C), or null.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Tris", + "Return Tris as {value, unit}, or null.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Na", + "Return Na as {value, unit}, or null.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "K", + "Return K as {value, unit}, or null.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Mg", + "Return Mg as {value, unit}, or null.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", 
"minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "DMSO", + "Return DMSO as {value, unit}, or null.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "outcome", + "Return true if article explicitly says this probe successfully hybridized, false if explicitly failed, or null if not stated.", + {"type": ["boolean", "null"]}, + ), + ] + elif chat_prompts == "my": + questions_to_schema = [ + ( + "is_seq", + "Check the whole article text. Is your picked sequence really a probe sequence or a part of probe sequence in this article text? Put true here if and only if this sequence is being described and presented as a hybridization probe. If that's a random abbreviation or nucleotide-looking string which is not a hybridization probe or otherwise not a hybridization probe, put false here.", + {"type": "boolean"}, + ), + ( + "sequence_full", + "Provide this sequence fully as a probe sequence in IUPAC-normalized format: from 5' to 3' end, with fluorophore and quencher. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + }, + ), + ( + "sequence_normalized", + "Provide this probe sequence in IUPAC-normalized format: from 5' to 3' end, with fluorophore and quencher. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. 
Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + ), + ( + "sequence_expanded", + "Provide this probe sequence in expanded IUPAC format (with all repeats expanded and no parentheses in the probe sequence backbone body): from 5' to 3' end, with fluorophore and quencher. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([a-zA-Z0-9_'-]*-)?([a-zA-Z0-9']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9']*?)(-[a-zA-Z0-9_'-]*)?-3'$", + }, + ), + ( + "sequence_backbone", + "Now provide only the probe sequence body from 5' to 3', without any fluorophores, modifications and quenchers. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([ACGUTRYSWKMBDHVN0-9()]{5,})-3'$", + }, + ), + ( + "sequence_backbone_expanded", + "Now provide only the expanded probe sequence body from 5' to 3' with all repeats expanded, without any fluorophores, modifications and quenchers. Use capital Latin letters, digits, dashes and apostrophy. Only the expanded backbone of probe sequence body. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([ACGUTRYSWKMBDHVN0-9]{5,})-3'$", + }, + ), + ( + "fluorophore", + "Provide the fluorophore of this probe. Use capital Latin letters, digits and dashes, you may also use an apostrophy. 
Put null here if not applicable or not present in the text of the article.", + { + "type": ["string", "null"], + "minLength": 3, + "maxLength": 150, + "pattern": r"^[A-Z0-9']{3,}$", + }, + ), + ( + "quencher", + "Provide the quencher of this probe. Use capital Latin letters, digits and dashes, you may also use an apostrophy. Put null here if not applicable or not present in the text of the article.", + { + "type": ["string", "null"], + "minLength": 3, + "maxLength": 150, + "pattern": r"^[A-Z0-9']{3,}$", + }, + ), + ( + "modifications", + "Now provide the modifications of the probe sequence as an array, where each element is a modification and its position in 5'-3' direction. Use Latin letters, digits and dashes, you may also use parentheses and apostrophy. Provide an empty array if not present in the article text.", + { + "type": "array", + "minItems": 0, + "maxItems": 150, + "items": { + "type": "object", + "additionalProperties": False, + "required": [ + "modification_position", + "modification_type", + "modification_description", + ], + "properties": { + "modification_position": { + "type": "integer", + "minimum": 1, + }, + "modification_type": { + "type": "string", + "maxLength": 100, + "minLength": 1, + }, + "modification_description": { + "type": "string", + "minLength": 1, + "maxLength": 150, + }, + }, + }, + }, + ), + ( + "target_raw", + "Describe the target to which this probe was designed to hybridize.", + {"type": "string", "minLength": 5, "maxLength": 250}, + ), + ( + "target_normalized", + "Now provide the target sequence to which this probe should hybridize, from 5' to 3'. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. 
Put null here if not applicable or if the exact sequence is not present in the article text.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + ), + ( + "primers", + "Describe the primer sequences in IUPAC-normalized format, each from 5' to 3' end. Use capital Latin letters, digits and dashes, parentheses and apostrophy. Put null to the primer if it is not present in the article text.", + { + "type": "object", + "additionalProperties": False, + "required": ["forward", "reverse"], + "properties": { + "forward": { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + "reverse": { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + }, + }, + ), + ( + "pH", + "Describe the pH in this experiment. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + {"type": ["number", "null"]}, + ), + ( + "annealing_raw", + "Describe the annealing in this experiment. Provide the raw description string. If that's can't be inferred from the whole article text, explain why.", + {"type": ["string"], "minLength": 10, "maxLength": 250}, + ), + ( + "T", + "Describe the melting temperature in this experiment and provide the measurement unit. 
Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Tris", + "Describe the amount of Tris in this experiment and provide the measurement unit. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Na", + "Describe the amount of Na (Sodium) in this experiment and provide the measurement unit. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "K", + "Describe the amount of K (Potassium) in this experiment and provide the measurement unit. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Mg", + "Describe the amount of Mg (Magnesium) in this experiment and provide the measurement unit. 
Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "DMSO", + "Describe the amount of DMSO in this experiment and provide the measurement unit. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "outcome", + "Describe the outcome of this hybridization experiment based on the article text. Put true in case of successful hybridization of this probe to target, put false in case of unsuccessful and put null if this information is not present in the article.", + {"type": ["boolean", "null"]}, + ), + ] + else: + raise ValueError("Chat prompts must either be 'my' or 'optimized'") + + answers_log: List[Dict[str, Any]] = [] + described_sequences: List[Tuple[str, Dict[str, Any]]] = [] + + try: + for seq in tqdm( + sequences, + desc=f"Found sequences in {article_stem}", + position=tqdm_position, + leave=False, + ): + # Slice a small, relevant article window for this sequence + snippet = extract_relevant_snippet(article_text, seq, window=1400) + + # Build a short system prompt (article is only injected ONCE here) + sys_prompt = ( + prompt + + "\n\nYou will answer a series of short JSON-only questions about a SINGLE candidate probe sequence.\n" + + "You MUST base answers ONLY on this article snippet:\n\n" + + snippet + + "\n\n" + + "Candidate probe:\n\n" + + seq + + "\n\n" + + "Return strictly JSON for each question — no extra commentary." 
+ ) + + # Create a fresh stateful session for THIS sequence (keeps context across questions) + chat = OllamaJSONChat( + client=client, + model_name=model_name, + system_prompt=sys_prompt, + options=ollama_parameters, + keep_alive="2m", + logger=logger, + use_schema_format=True, # will auto-downgrade if not supported + ) + + seq_desc: Dict[str, Any] = {} + for param, query, schema in tqdm( + questions_to_schema, + desc=f"Questions for {seq[:24]}…", + position=tqdm_position + 1, + leave=False, + ): + try: + user_msg = ( + query + + "\nReturn ONLY valid JSON matching this schema:\n" + + json.dumps(schema, ensure_ascii=False) + ) + raw_json = chat.ask_json(user_msg, schema=schema) + # Best-effort repair + parse + fixed = repair_json(raw_json) + obj = json.loads(fixed) + + # Persist logs + with open(raw_txt_path, mode="at", encoding="utf-8") as f: + f.write(f"> {query}\n< {raw_json}\n\n") + + answers_log.append( + {"sequence": seq, "param": param, "response": obj} + ) + seq_desc[param] = obj + except Exception as e: + logger.exception( + f"Exception on sequence {seq} during question '{param}'" + ) + with open(err_log_path, mode="at", encoding="utf-8") as ef: + ef.write(f"[{seq}] {param} error: {repr(e)}\n") + + described_sequences.append((seq, seq_desc)) + + finally: + json_log_path.write_text( + json.dumps(answers_log, indent=2, ensure_ascii=False), encoding="utf-8" + ) + json_out_path.write_text( + json.dumps({s: d for (s, d) in described_sequences}, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + return described_sequences + + # ────────────────────────────────────────────────────────────────────── # Outlines runner # ────────────────────────────────────────────────────────────────────── @@ -587,7 +1326,7 @@ def run_query_model( ollama_parameters: Dict[str, Any], model_name: str, tqdm_position: int = 0, -) -> Dict[str, Any]: +) -> List[Tuple[str, Any]]: """Run one pass (schema+prompt from files), save raw+json+log, return object.""" pass_name = "query_chat" 
txt_dir = out_base / "txt" @@ -938,9 +1677,12 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): ) return seq_desc - described_sequences: Dict[str, Dict[str, Any]] = dict() + described_sequences: List[Tuple[str, Dict[str, Any]]] = [] for seq in tqdm( - sequences, desc=f"Found sequences in {article_stem}", position=tqdm_position, leave=False + sequences, + desc=f"Found sequences in {article_stem}", + position=tqdm_position, + leave=False, ): base_chat_with_sequence = outlines.inputs.Chat(base_chat.messages) base_chat_with_sequence.add_user_message( @@ -954,7 +1696,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): sequence_descriptor = parse_sequence( seq, base_chat=base_chat_with_sequence ) - described_sequences[seq] = sequence_descriptor + described_sequences.append(seq, sequence_descriptor) answers.append( {"sequence": seq, "sequence_descriptor": sequence_descriptor} ) @@ -974,7 +1716,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): json.dumps(described_sequences, indent=2, ensure_ascii=False), encoding="utf-8", ) - return described_sequences + return described_sequences # ────────────────────────────────────────────────────────────────────── @@ -1368,7 +2110,9 @@ def run_project(project_dir: str | Path) -> None: ) logger.info(f"Files: {files}") - for art_path in tqdm(files, desc=f"Articles for model {model_name}", position=1, leave=False): + for art_path in tqdm( + files, desc=f"Articles for model {model_name}", position=1, leave=False + ): article_name = art_path.stem logger.info(f"=== {article_name} : {model_name} ===") article_text = art_path.read_text(encoding="utf-8") @@ -1398,19 +2142,146 @@ def run_project(project_dir: str | Path) -> None: f"Pass failed: {p.name} : {article_name} : {model_name}" ) + strict_sequences: Set[str] = set(map(lambda s: s.upper(), outputs.get("SeqPrompt_strict", []))) + nonstrict_sequences: Set[str] = set(map(lambda s: s.upper(), outputs.get("SeqPrompt", []))) + + 
all_found_sequences = list( sorted( - set( - set(outputs.get("SeqPrompt_strict", [])).union( - outputs.get("SeqPrompt", []) - ) - ) + strict_sequences.union(nonstrict_sequences), + key=lambda s: (0 if s in strict_sequences else 1), ) ) all_found_sequences_str = ", ".join(all_found_sequences) logger.info("Pre-passes done, found sequences: " + all_found_sequences_str) - sequence_descriptors = run_query_model( + for p in tqdm( + cfg.passes, desc=f"{article_name} passes", leave=False, position=2 + ): + try: + outputs[p.name] = run_single_pass( + model=model, + article_text=article_text, + pass_cfg=p, + out_base=out_base, + article_stem=article_name, + tools=tools, + logger=logger, + ollama_parameters=cfg.ollama_parameters, + model_name=model_name, + ) + except Exception: + logger.exception( + f"Pass failed: {p.name} : {article_name} : {model_name}" + ) + + + optimized_sequence_descriptors = run_query_model_speed_up( + model=model, # not used in the fast version but kept for signature compatibility + article_text=article_text, + sequences=all_found_sequences, + out_base=out_base, + article_stem=article_name, + common_prompt_path=cfg.common_prompt_path, + ollama_parameters=cfg.ollama_parameters, + logger=logger, + model_name=model_name, + tqdm_position=2, + client=client, # <-- important: pass the raw ollama.Client + chat_prompts="optimized" + ) + + + stamp = _now_stamp() + full_dir = out_base / "json_full" + full_dir.mkdir(parents=True, exist_ok=True) + full_seq_desc_path = ( + full_dir + / f"{article_name}_{model_name_encode(model_name)}__SeqDesc-OPTIM__{stamp}.json" + ) + full_seq_desc_path.write_text( + json.dumps(optimized_sequence_descriptors, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + + try: + # Optional DB insert + if cfg.db_path: + try: + from hyb_db import insert_seqdesc_object # your earlier module + + run_id = insert_seqdesc_object( + db_path=str(cfg.db_path), + article_name=article_name, + doi=outputs.get("A_core", {}).get("doi", None), + 
model_name=model_name, + sequence_descriptors=optimized_sequence_descriptors, + source_path=art_path, + ) + logger.info( + f"[DB INSERT SEQDESC OPTIM] inserted run_id={run_id} for {article_name} : {model_name}" + ) + except Exception: + logger.exception("[DB INSERT SEQDESC OPTIM] insertion failed") + except Exception: + logger.exception( + f"[DB INSERT SEQDESC OPTIM] stitching failed for {article_name} : {model_name}" + ) + + + my_sequence_descriptors = run_query_model_speed_up( + model=model, # not used in the fast version but kept for signature compatibility + article_text=article_text, + sequences=all_found_sequences, + out_base=out_base, + article_stem=article_name, + common_prompt_path=cfg.common_prompt_path, + ollama_parameters=cfg.ollama_parameters, + logger=logger, + model_name=model_name, + tqdm_position=2, + client=client, # <-- important: pass the raw ollama.Client + chat_prompts="my" + ) + + stamp = _now_stamp() + full_dir = out_base / "json_full" + full_dir.mkdir(parents=True, exist_ok=True) + full_seq_desc_path = ( + full_dir + / f"{article_name}_{model_name_encode(model_name)}__SeqDesc-MY__{stamp}.json" + ) + full_seq_desc_path.write_text( + json.dumps(my_sequence_descriptors, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + + + try: + # Optional DB insert + if cfg.db_path: + try: + from hyb_db import insert_seqdesc_object # your earlier module + + run_id = insert_seqdesc_object( + db_path=str(cfg.db_path), + article_name=article_name, + doi=outputs.get("A_core", {}).get("doi", None), + model_name=model_name, + sequence_descriptors=my_sequence_descriptors, + source_path=art_path, + ) + logger.info( + f"[DB INSERT SEQDESC MY] inserted run_id={run_id} for {article_name} : {model_name}" + ) + except Exception: + logger.exception("[DB INSERT SEQDESC MY] insertion failed") + except Exception: + logger.exception( + f"[DB INSERT SEQDESC MY] stitching failed for {article_name} : {model_name}" + ) + + old_sequence_descriptors = run_query_model( 
model=model, article_text=article_text, sequences=all_found_sequences, @@ -1423,6 +2294,49 @@ def run_project(project_dir: str | Path) -> None: tqdm_position=2, ) + + stamp = _now_stamp() + full_dir = out_base / "json_full" + full_dir.mkdir(parents=True, exist_ok=True) + full_seq_desc_path = ( + full_dir + / f"{article_name}_{model_name_encode(model_name)}__SeqDesc-OLD__{stamp}.json" + ) + full_seq_desc_path.write_text( + json.dumps(old_sequence_descriptors, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + + try: + # Optional DB insert + if cfg.db_path: + try: + from hyb_db import insert_seqdesc_object # your earlier module + + run_id = insert_seqdesc_object( + db_path=str(cfg.db_path), + article_name=article_name, + doi=outputs.get("A_core", {}).get("doi", None), + model_name=model_name, + sequence_descriptors=old_sequence_descriptors, + source_path=art_path, + ) + logger.info( + f"[DB INSERT SEQDESC OLD] inserted run_id={run_id} for {article_name} : {model_name}" + ) + except Exception: + logger.exception("[DB INSERT SEQDESC OLD] insertion failed") + except Exception: + logger.exception( + f"[DB INSERT SEQDESC OLD] stitching failed for {article_name} : {model_name}" + ) + + + sequence_descriptors: List[Tuple[str, Dict[str, Any]]] = [] + sequence_descriptors.extend(optimized_sequence_descriptors) + sequence_descriptors.extend(my_sequence_descriptors) + sequence_descriptors.extend(old_sequence_descriptors) + stamp = _now_stamp() full_dir = out_base / "json_full" full_dir.mkdir(parents=True, exist_ok=True) @@ -1435,6 +2349,7 @@ def run_project(project_dir: str | Path) -> None: encoding="utf-8", ) + for i, seq in enumerate( tqdm( all_found_sequences, @@ -1467,21 +2382,7 @@ def run_project(project_dir: str | Path) -> None: f"Pass failed: {p.name} : {article_name} : {model_name}" ) - for p in tqdm(cfg.passes, desc=f"{article_name} passes", leave=False, position=2): - try: - outputs[p.name] = run_single_pass( - model=model, - article_text=article_text, - 
pass_cfg=p, - out_base=out_base, - article_stem=article_name, - tools=tools, - logger=logger, - ollama_parameters=cfg.ollama_parameters, - model_name=model_name, - ) - except Exception: - logger.exception(f"Pass failed: {p.name} : {article_name} : {model_name}") + # Stitch only if the expected pass names are present try: @@ -1496,57 +2397,83 @@ def run_project(project_dir: str | Path) -> None: # Final validation if full_validator: - errs = sorted(full_validator.iter_errors(full_obj), key=lambda e: e.path) + errs = sorted( + full_validator.iter_errors(full_obj), key=lambda e: e.path + ) if errs: - logger.error(f"[FULL] validation errors for {article_name} : {model_name}:\n" + "\n".join(str(e) for e in errs)) + logger.error( + f"[FULL] validation errors for {article_name} : {model_name}:\n" + + "\n".join(str(e) for e in errs) + ) else: - logger.info(f"[FULL] validation OK for {article_name} : {model_name}") + logger.info( + f"[FULL] validation OK for {article_name} : {model_name}" + ) # Save full object (timestamped) stamp = _now_stamp() full_dir = out_base / "json_full" full_dir.mkdir(parents=True, exist_ok=True) - full_path = full_dir / f"{article_name}_{model_name_encode(model_name)}__FULL__{stamp}.json" - full_path.write_text(json.dumps(full_obj, indent=2, ensure_ascii=False), encoding="utf-8") - logger.info(f"[FULL] wrote {full_path.name} {article_name} : {model_name}") + full_path = ( + full_dir + / f"{article_name}_{model_name_encode(model_name)}__FULL__{stamp}.json" + ) + full_path.write_text( + json.dumps(full_obj, indent=2, ensure_ascii=False), encoding="utf-8" + ) + logger.info( + f"[FULL] wrote {full_path.name} {article_name} : {model_name}" + ) except Exception: - logger.exception(f"[FULL] stitching failed for {article_name} : {model_name}") + logger.exception( + f"[FULL] stitching failed for {article_name} : {model_name}" + ) try: # Optional DB insert if cfg.db_path: try: from hyb_db import insert_article_object # your earlier module + run_id = 
insert_article_object( db_path=str(cfg.db_path), article_obj=full_obj, model_name=model_name, article_name=article_name, ) - logger.info(f"[DB] inserted run_id={run_id} for {article_name} : {model_name}") + logger.info( + f"[DB INSERT FULL] inserted run_id={run_id} for {article_name} : {model_name}" + ) except Exception: - logger.exception("[DB] insertion failed") + logger.exception("[DB INSERT FULL] insertion failed") except Exception: - logger.exception(f"[DB INSERT FULL] stitching failed for {article_name} : {model_name}") + logger.exception( + f"[DB INSERT FULL] stitching failed for {article_name} : {model_name}" + ) try: # Optional DB insert if cfg.db_path: try: from hyb_db import insert_seqdesc_object # your earlier module + run_id = insert_seqdesc_object( db_path=str(cfg.db_path), article_name=article_name, doi=outputs.get("A_core", {}).get("doi", None), model_name=model_name, - sequence_descriptors=sequence_descriptors + sequence_descriptors=sequence_descriptors, source_path=art_path, ) - logger.info(f"[DB] inserted run_id={run_id} for {article_name} : {model_name}") + logger.info( + f"[DB INSERT SEQDESC] inserted run_id={run_id} for {article_name} : {model_name}" + ) except Exception: - logger.exception("[DB] insertion failed") + logger.exception("[DB INSERT SEQDESC] insertion failed") except Exception: - logger.exception(f"[DB INSERT SEQDESC] stitching failed for {article_name} : {model_name}") + logger.exception( + f"[DB INSERT SEQDESC] stitching failed for {article_name} : {model_name}" + ) # Optional CLI hook (project_dir arg) From 7316a53ffc2a4cc61799041a6cafea169b58d9fc Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Thu, 9 Oct 2025 04:24:40 +0400 Subject: [PATCH 075/102] Should have added format fixer --- extraction/pipeline_pre_quest.py | 54 +++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 2b18c82..632975b 100755 --- 
a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -425,7 +425,6 @@ def ask_json( return res.get("response", "") # generate() returns 'response' - def extract_relevant_snippet(article_text: str, seq: str, *, window: int = 1200) -> str: """ Find a case-insensitive hit of 'seq' in article_text and return a small window @@ -1053,7 +1052,28 @@ def run_query_model_speed_up( answers_log.append( {"sequence": seq, "param": param, "response": obj} ) - seq_desc[param] = obj + + fix_query = f"There was a task: {query} on which the LLM produced an output:\n```json\n{raw_json}\n```. Please, rewrite it to satisfy the given schema format:\n```json\n{json.dumps(schema)}\n```.", + format_fixed_raw_json = think_generate( + model=model, + model_input=fix_query, + logger=logger, + output_type=JsonSchema(schema=schema), + think=True, + ) + + # Persist logs + with open(raw_txt_path, mode="at", encoding="utf-8") as f: + f.write(f"> {fix_query}\n< {format_fixed_raw_json}\n\n") + + format_fixed = repair_json(format_fixed_raw_json) + fixed_obj = json.loads(format_fixed) + + answers_log.append( + {"sequence": seq, "param": param, "response": fixed_obj} + ) + + seq_desc[param] = fixed_obj except Exception as e: logger.exception( f"Exception on sequence {seq} during question '{param}'" @@ -1068,7 +1088,9 @@ def run_query_model_speed_up( json.dumps(answers_log, indent=2, ensure_ascii=False), encoding="utf-8" ) json_out_path.write_text( - json.dumps({s: d for (s, d) in described_sequences}, indent=2, ensure_ascii=False), + json.dumps( + {s: d for (s, d) in described_sequences}, indent=2, ensure_ascii=False + ), encoding="utf-8", ) return described_sequences @@ -2142,9 +2164,12 @@ def run_project(project_dir: str | Path) -> None: f"Pass failed: {p.name} : {article_name} : {model_name}" ) - strict_sequences: Set[str] = set(map(lambda s: s.upper(), outputs.get("SeqPrompt_strict", []))) - nonstrict_sequences: Set[str] = set(map(lambda s: s.upper(), outputs.get("SeqPrompt", 
[]))) - + strict_sequences: Set[str] = set( + map(lambda s: s.upper(), outputs.get("SeqPrompt_strict", [])) + ) + nonstrict_sequences: Set[str] = set( + map(lambda s: s.upper(), outputs.get("SeqPrompt", [])) + ) all_found_sequences = list( sorted( @@ -2175,7 +2200,6 @@ def run_project(project_dir: str | Path) -> None: f"Pass failed: {p.name} : {article_name} : {model_name}" ) - optimized_sequence_descriptors = run_query_model_speed_up( model=model, # not used in the fast version but kept for signature compatibility article_text=article_text, @@ -2188,10 +2212,9 @@ def run_project(project_dir: str | Path) -> None: model_name=model_name, tqdm_position=2, client=client, # <-- important: pass the raw ollama.Client - chat_prompts="optimized" + chat_prompts="optimized", ) - stamp = _now_stamp() full_dir = out_base / "json_full" full_dir.mkdir(parents=True, exist_ok=True) @@ -2200,7 +2223,9 @@ def run_project(project_dir: str | Path) -> None: / f"{article_name}_{model_name_encode(model_name)}__SeqDesc-OPTIM__{stamp}.json" ) full_seq_desc_path.write_text( - json.dumps(optimized_sequence_descriptors, indent=2, ensure_ascii=False), + json.dumps( + optimized_sequence_descriptors, indent=2, ensure_ascii=False + ), encoding="utf-8", ) @@ -2228,7 +2253,6 @@ def run_project(project_dir: str | Path) -> None: f"[DB INSERT SEQDESC OPTIM] stitching failed for {article_name} : {model_name}" ) - my_sequence_descriptors = run_query_model_speed_up( model=model, # not used in the fast version but kept for signature compatibility article_text=article_text, @@ -2241,7 +2265,7 @@ def run_project(project_dir: str | Path) -> None: model_name=model_name, tqdm_position=2, client=client, # <-- important: pass the raw ollama.Client - chat_prompts="my" + chat_prompts="my", ) stamp = _now_stamp() @@ -2256,7 +2280,6 @@ def run_project(project_dir: str | Path) -> None: encoding="utf-8", ) - try: # Optional DB insert if cfg.db_path: @@ -2294,7 +2317,6 @@ def run_project(project_dir: str | Path) -> 
None: tqdm_position=2, ) - stamp = _now_stamp() full_dir = out_base / "json_full" full_dir.mkdir(parents=True, exist_ok=True) @@ -2331,7 +2353,6 @@ def run_project(project_dir: str | Path) -> None: f"[DB INSERT SEQDESC OLD] stitching failed for {article_name} : {model_name}" ) - sequence_descriptors: List[Tuple[str, Dict[str, Any]]] = [] sequence_descriptors.extend(optimized_sequence_descriptors) sequence_descriptors.extend(my_sequence_descriptors) @@ -2349,7 +2370,6 @@ def run_project(project_dir: str | Path) -> None: encoding="utf-8", ) - for i, seq in enumerate( tqdm( all_found_sequences, @@ -2382,8 +2402,6 @@ def run_project(project_dir: str | Path) -> None: f"Pass failed: {p.name} : {article_name} : {model_name}" ) - - # Stitch only if the expected pass names are present try: A = outputs.get("A_core", {}) From 5a89f893363d132dc752c0a42e1ad68f8a58f634 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Thu, 9 Oct 2025 04:26:24 +0400 Subject: [PATCH 076/102] Tiny typo fixed --- extraction/pipeline_pre_quest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 632975b..858ebfc 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1053,7 +1053,7 @@ def run_query_model_speed_up( {"sequence": seq, "param": param, "response": obj} ) - fix_query = f"There was a task: {query} on which the LLM produced an output:\n```json\n{raw_json}\n```. Please, rewrite it to satisfy the given schema format:\n```json\n{json.dumps(schema)}\n```.", + fix_query = f"There was a task: {query} on which the LLM produced an output:\n```json\n{raw_json}\n```. Please, rewrite it to satisfy the given schema format:\n```json\n{json.dumps(schema)}\n```." 
format_fixed_raw_json = think_generate( model=model, model_input=fix_query, From 5df53571417ab14637e7c11fd1640c761fc75a29 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Thu, 9 Oct 2025 13:55:06 +0400 Subject: [PATCH 077/102] Update seq --- extraction/hyb_db.py | 2 +- extraction/pipeline_pre_quest.py | 33 ++++++++++++++++++++------------ 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/extraction/hyb_db.py b/extraction/hyb_db.py index ee03fc1..27f61de 100644 --- a/extraction/hyb_db.py +++ b/extraction/hyb_db.py @@ -314,7 +314,7 @@ def insert_article_object(db_path: str, article_obj: Dict[str, Any], _ensure_schema(conn) cur = conn.cursor() - doi = article_obj.get("doi") + doi = article_obj.get("doi", "unknown") if not doi: raise ValueError("Input must contain a top-level 'doi' string.") diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 858ebfc..3e20af2 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -533,7 +533,8 @@ def run_query_model_speed_up( "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + #"pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", }, ), ( @@ -543,7 +544,8 @@ def run_query_model_speed_up( "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'-([A-Za-z0-9_'\-]*-)?([A-Za-z0-9']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9']*?)(-[A-Za-z0-9_'\-]*)?-3'$", + #"pattern": r"^5'-([A-Za-z0-9_'\-]*-)?([A-Za-z0-9']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9']*?)(-[A-Za-z0-9_'\-]*)?-3'$", + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", }, ), ( @@ -629,7 
+631,8 @@ def run_query_model_speed_up( "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + #"pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", }, ), ( @@ -644,13 +647,15 @@ def run_query_model_speed_up( "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + #"pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", }, "reverse": { "type": ["string", "null"], "minLength": 5, "maxLength": 150, - "pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + #"pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", }, }, }, @@ -1054,13 +1059,17 @@ def run_query_model_speed_up( ) fix_query = f"There was a task: {query} on which the LLM produced an output:\n```json\n{raw_json}\n```. Please, rewrite it to satisfy the given schema format:\n```json\n{json.dumps(schema)}\n```." 
- format_fixed_raw_json = think_generate( - model=model, - model_input=fix_query, - logger=logger, - output_type=JsonSchema(schema=schema), - think=True, - ) + try: + format_fixed_raw_json = think_generate( + model=model, + model_input=fix_query, + logger=logger, + output_type=JsonSchema(schema=schema), + think=True, + ) + except ollama.ResponseError: + logger.exception(f"Error on model {model.model_name}, sequence {seq}, query {query} and prompts {chat_prompts}") + print("", flush=True) # Persist logs with open(raw_txt_path, mode="at", encoding="utf-8") as f: From 116d5b7c3cffa0476e76f0f99ee97311c104af52 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Thu, 9 Oct 2025 14:08:31 +0400 Subject: [PATCH 078/102] Prompt slightly changed --- extraction/pipeline_pre_quest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 3e20af2..0a315f7 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1058,7 +1058,7 @@ def run_query_model_speed_up( {"sequence": seq, "param": param, "response": obj} ) - fix_query = f"There was a task: {query} on which the LLM produced an output:\n```json\n{raw_json}\n```. Please, rewrite it to satisfy the given schema format:\n```json\n{json.dumps(schema)}\n```." + fix_query = f"There was a task: {query} on which the LLM produced an output:\n```json\n{raw_json}\n```. Please, rewrite it to satisfy the given schema format:\n```json\n{json.dumps(schema)}\nReturn null if and only if there is not enough data and provided data is insufficient for inferring the request.```." 
try: format_fixed_raw_json = think_generate( model=model, From c9399693dbc3a743cf81a6ad2ec654cc7251a7bb Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 10 Oct 2025 00:48:49 +0400 Subject: [PATCH 079/102] Fixed errors in tuples --- extraction/pipeline_pre_quest.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 0a315f7..9f5959b 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1058,11 +1058,15 @@ def run_query_model_speed_up( {"sequence": seq, "param": param, "response": obj} ) - fix_query = f"There was a task: {query} on which the LLM produced an output:\n```json\n{raw_json}\n```. Please, rewrite it to satisfy the given schema format:\n```json\n{json.dumps(schema)}\nReturn null if and only if there is not enough data and provided data is insufficient for inferring the request.```." + #fix_query = f"There was a task: {query} on which the LLM produced an output:\n```json\n{raw_json}\n```. Please, rewrite it to satisfy the given schema format:\n```json\n{json.dumps(schema)}\nReturn null if and only if there is not enough data and provided data is insufficient for inferring the request.```." + #fix_query = f"Rewrite the object {raw_json} in the new schema. Return null if and only if there is not enough data and provided data is insufficient for inferring the request.```." 
+ fix_chat = outlines.inputs.Chat() + fix_chat.add_system_message(prompt + f"\nIn this chat you have to transform the user-provided JSON object to match the following schema:\n```json\n{json.dumps(schema)}\n```\n.If user provided-data is not enough to fill-in some fields, put null value in them, but try harder to transform as much data to the new schema as possible.") + fix_chat.add_user_message(raw_json) try: format_fixed_raw_json = think_generate( model=model, - model_input=fix_query, + model_input=fix_chat, logger=logger, output_type=JsonSchema(schema=schema), think=True, @@ -1073,7 +1077,7 @@ def run_query_model_speed_up( # Persist logs with open(raw_txt_path, mode="at", encoding="utf-8") as f: - f.write(f"> {fix_query}\n< {format_fixed_raw_json}\n\n") + f.write(f"> {'\n'.join(fix_chat.messages)}\n< {format_fixed_raw_json}\n\n") format_fixed = repair_json(format_fixed_raw_json) fixed_obj = json.loads(format_fixed) @@ -1727,7 +1731,7 @@ def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): sequence_descriptor = parse_sequence( seq, base_chat=base_chat_with_sequence ) - described_sequences.append(seq, sequence_descriptor) + described_sequences.append((seq, sequence_descriptor)) answers.append( {"sequence": seq, "sequence_descriptor": sequence_descriptor} ) From b1c0b4d058d798b9f6fcacf00b10d63c73bbef9d Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 10 Oct 2025 00:50:22 +0400 Subject: [PATCH 080/102] f-string bug fixed --- extraction/pipeline_pre_quest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 9f5959b..2403a1a 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1076,8 +1076,9 @@ def run_query_model_speed_up( print("", flush=True) # Persist logs + msgs = '\n'.join(fix_chat.messages) with open(raw_txt_path, mode="at", encoding="utf-8") as f: - f.write(f"> {'\n'.join(fix_chat.messages)}\n< 
{format_fixed_raw_json}\n\n") + f.write(f"> {msgs}\n< {format_fixed_raw_json}\n\n") format_fixed = repair_json(format_fixed_raw_json) fixed_obj = json.loads(format_fixed) From 6ed64439a074d19ffaec3851af8bc6b47614d4eb Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 10 Oct 2025 00:54:15 +0400 Subject: [PATCH 081/102] Bugfix messages --- extraction/pipeline_pre_quest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 2403a1a..73c8e1a 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1076,9 +1076,9 @@ def run_query_model_speed_up( print("", flush=True) # Persist logs - msgs = '\n'.join(fix_chat.messages) + #msgs = '\n'.join(map(lambda k,v: "\n".join([f"{k}: {v}"]), fix_chat.messages)) with open(raw_txt_path, mode="at", encoding="utf-8") as f: - f.write(f"> {msgs}\n< {format_fixed_raw_json}\n\n") + f.write(f"> FIX_PROMPT\n< {format_fixed_raw_json}\n\n") format_fixed = repair_json(format_fixed_raw_json) fixed_obj = json.loads(format_fixed) From 6e045fdc2f51d30bb4d0710b5f03b490ce9f260a Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 10 Oct 2025 00:54:45 +0400 Subject: [PATCH 082/102] Update params to include more models --- extraction/config/pipeline.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 6a7c577..85bb570 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,7 +1,8 @@ { "model_names": [ + "myaniu/qwen2.5-1m:7b", "phi4:14b", - "myaniu/qwen2.5-1m:7b" + "phi3:latest" ], "ollama_parameters": { "num_ctx": 65536, From bd4e8c9c985fb8df5c720a35cae1b6454a95f338 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Sat, 11 Oct 2025 01:03:58 +0400 Subject: [PATCH 083/102] Start run for all articles --- extraction/config/pipeline.json | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 85bb570..7c35d13 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,8 +1,7 @@ { "model_names": [ "myaniu/qwen2.5-1m:7b", - "phi4:14b", - "phi3:latest" + "phi4:14b" ], "ollama_parameters": { "num_ctx": 65536, @@ -12,7 +11,8 @@ }, "ollama_base_url": "http://127.0.0.1:11434", "timeout_s": 120, - "input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", + "__input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", + "input_dir": "/mnt/Models/articles2_marker/no_llm/markdown/", "_input_dir": "input/md", "out_dir": "outlines_output_db", "full_schema_path": "schema/json/article.json", From 3ffa95c3a75361dad69ec0ecf10717f4dc4af9e7 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Sun, 12 Oct 2025 01:39:09 +0400 Subject: [PATCH 084/102] Re-run try faster --- extraction/config/pipeline.json | 2 +- extraction/pipeline_pre_quest.py | 26 ++++++++++++++------------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 7c35d13..aab5980 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -10,7 +10,7 @@ "seed": 42 }, "ollama_base_url": "http://127.0.0.1:11434", - "timeout_s": 120, + "timeout_s": 45, "__input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", "input_dir": "/mnt/Models/articles2_marker/no_llm/markdown/", "_input_dir": "input/md", diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 73c8e1a..87d2f47 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -2318,18 +2318,20 @@ def run_project(project_dir: str | Path) -> None: f"[DB INSERT SEQDESC MY] stitching failed for {article_name} : {model_name}" ) - old_sequence_descriptors = run_query_model( - model=model, - article_text=article_text, - sequences=all_found_sequences, - 
out_base=out_base, - article_stem=article_name, - common_prompt_path=cfg.common_prompt_path, - ollama_parameters=cfg.ollama_parameters, - logger=logger, - model_name=model_name, - tqdm_position=2, - ) + logger.warning("[SeqDesc-OLD] Parsing old sequence descriptors is disabled in this run.") + old_sequence_descriptors = [] + # old_sequence_descriptors = run_query_model( + # model=model, + # article_text=article_text, + # sequences=all_found_sequences, + # out_base=out_base, + # article_stem=article_name, + # common_prompt_path=cfg.common_prompt_path, + # ollama_parameters=cfg.ollama_parameters, + # logger=logger, + # model_name=model_name, + # tqdm_position=2, + # ) stamp = _now_stamp() full_dir = out_base / "json_full" From a668f07a84f7e3c80c3f1756150b7c33c59660d6 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Sun, 12 Oct 2025 02:03:58 +0400 Subject: [PATCH 085/102] Will pass the whole article text and lower the temperature for the fix transform --- extraction/pipeline_pre_quest.py | 110 ++++++++++++++++++------------- 1 file changed, 65 insertions(+), 45 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 87d2f47..7c151d6 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -37,6 +37,7 @@ from jsonschema import Draft202012Validator from outlines.types import JsonSchema from tqdm import tqdm +from json_repair import repair_json as rep_json API_TOKEN = os.getenv("OPEN_BUTTON_TOKEN", None) @@ -324,7 +325,7 @@ def repair_json(text: str) -> str: end = text.rfind("}") if start == -1 or end == -1 or end <= start: return text - candidate = text[start : end + 1] + candidate = rep_json(text[start : end + 1]) try: json.loads(candidate) return candidate @@ -533,7 +534,7 @@ def run_query_model_speed_up( "type": ["string", "null"], "minLength": 5, "maxLength": 150, - #"pattern": 
r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + # "pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", }, ), @@ -544,7 +545,7 @@ def run_query_model_speed_up( "type": ["string", "null"], "minLength": 5, "maxLength": 150, - #"pattern": r"^5'-([A-Za-z0-9_'\-]*-)?([A-Za-z0-9']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9']*?)(-[A-Za-z0-9_'\-]*)?-3'$", + # "pattern": r"^5'-([A-Za-z0-9_'\-]*-)?([A-Za-z0-9']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9']*?)(-[A-Za-z0-9_'\-]*)?-3'$", "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", }, ), @@ -631,7 +632,7 @@ def run_query_model_speed_up( "type": ["string", "null"], "minLength": 5, "maxLength": 150, - #"pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + # "pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", }, ), @@ -647,14 +648,14 @@ def run_query_model_speed_up( "type": ["string", "null"], "minLength": 5, "maxLength": 150, - #"pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + # "pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", }, "reverse": { "type": ["string", "null"], "minLength": 5, "maxLength": 150, - #"pattern": 
r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + # "pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", }, }, @@ -1012,13 +1013,14 @@ def run_query_model_speed_up( sys_prompt = ( prompt + "\n\nYou will answer a series of short JSON-only questions about a SINGLE candidate probe sequence.\n" - + "You MUST base answers ONLY on this article snippet:\n\n" - + snippet - + "\n\n" - + "Candidate probe:\n\n" + + "You MUST base answers ONLY on this article text:\n
\n" + + article_text + + "\n
\n" + + f"And the most relevant snippet seems to be \nsnippet\n\n\n" + + "The candidate for being a probe sequence is:\n\n" + seq - + "\n\n" - + "Return strictly JSON for each question — no extra commentary." + + "\n
\nAnd you must bow work with only this sequence and all relevant context for it. You will be asked a series of questions about this sequence.\n" + + "Return strictly JSON for each question — no extra commentary. You will receive a JSON schema in each question." ) # Create a fresh stateful session for THIS sequence (keeps context across questions) @@ -1054,40 +1056,56 @@ def run_query_model_speed_up( with open(raw_txt_path, mode="at", encoding="utf-8") as f: f.write(f"> {query}\n< {raw_json}\n\n") - answers_log.append( - {"sequence": seq, "param": param, "response": obj} - ) + validator = Draft202012Validator(json.loads(schema)) - #fix_query = f"There was a task: {query} on which the LLM produced an output:\n```json\n{raw_json}\n```. Please, rewrite it to satisfy the given schema format:\n```json\n{json.dumps(schema)}\nReturn null if and only if there is not enough data and provided data is insufficient for inferring the request.```." - #fix_query = f"Rewrite the object {raw_json} in the new schema. Return null if and only if there is not enough data and provided data is insufficient for inferring the request.```." - fix_chat = outlines.inputs.Chat() - fix_chat.add_system_message(prompt + f"\nIn this chat you have to transform the user-provided JSON object to match the following schema:\n```json\n{json.dumps(schema)}\n```\n.If user provided-data is not enough to fill-in some fields, put null value in them, but try harder to transform as much data to the new schema as possible.") - fix_chat.add_user_message(raw_json) - try: - format_fixed_raw_json = think_generate( - model=model, - model_input=fix_chat, - logger=logger, - output_type=JsonSchema(schema=schema), - think=True, + errors = sorted(validator.iter_errors(obj), key=lambda er: er.path) + if errors: + # fix_query = f"There was a task: {query} on which the LLM produced an output:\n```json\n{raw_json}\n```. 
Please, rewrite it to satisfy the given schema format:\n```json\n{json.dumps(schema)}\nReturn null if and only if there is not enough data and provided data is insufficient for inferring the request.```." + # fix_query = f"Rewrite the object {raw_json} in the new schema. Return null if and only if there is not enough data and provided data is insufficient for inferring the request.```." + fix_chat = outlines.inputs.Chat() + fix_chat.add_system_message( + prompt + + f"\nIn this chat you have to transform the user-provided JSON object to match the following schema:\n```json\n{json.dumps(schema)}\n```\n. If user provided-data is not enough to fill-in some fields, put null value in them, but try harder to transform as much data to the new schema as possible. Please do not modify or invent values by yourself. Just move existing values to the corresponging fields of the schema. Please be thoughtful and careful while doing so!" ) - except ollama.ResponseError: - logger.exception(f"Error on model {model.model_name}, sequence {seq}, query {query} and prompts {chat_prompts}") - print("", flush=True) - - # Persist logs - #msgs = '\n'.join(map(lambda k,v: "\n".join([f"{k}: {v}"]), fix_chat.messages)) - with open(raw_txt_path, mode="at", encoding="utf-8") as f: - f.write(f"> FIX_PROMPT\n< {format_fixed_raw_json}\n\n") - - format_fixed = repair_json(format_fixed_raw_json) - fixed_obj = json.loads(format_fixed) - - answers_log.append( - {"sequence": seq, "param": param, "response": fixed_obj} - ) - - seq_desc[param] = fixed_obj + fix_chat.add_user_message(raw_json) + try: + format_fixed_raw_json = think_generate( + model=model, + model_input=fix_chat, + logger=logger, + output_type=JsonSchema(schema=schema), + think=True, + options=ollama_parameters, + ) + except ollama.ResponseError: + logger.exception( + f"Error on model {model.model_name}, sequence {seq}, query {query} and prompts {chat_prompts}" + ) + print("", flush=True) + format_fixed_raw_json = raw_json + + # Persist logs 
+ # msgs = '\n'.join(map(lambda k,v: "\n".join([f"{k}: {v}"]), fix_chat.messages)) + with open(raw_txt_path, mode="at", encoding="utf-8") as f: + f.write(f"> FIX_PROMPT\n< {format_fixed_raw_json}\n\n") + + format_fixed = repair_json(format_fixed_raw_json) + fixed_obj = json.loads(format_fixed) + + answers_log.append( + { + "sequence": seq, + "param": param, + "response": obj, + "fixed_response": fixed_obj, + } + ) + seq_desc[param] = fixed_obj + else: + answers_log.append( + {"sequence": seq, "param": param, "response": obj} + ) + seq_desc[param] = obj except Exception as e: logger.exception( f"Exception on sequence {seq} during question '{param}'" @@ -2318,7 +2336,9 @@ def run_project(project_dir: str | Path) -> None: f"[DB INSERT SEQDESC MY] stitching failed for {article_name} : {model_name}" ) - logger.warning("[SeqDesc-OLD] Parsing old sequence descriptors is disabled in this run.") + logger.warning( + "[SeqDesc-OLD] Parsing old sequence descriptors is disabled in this run." + ) old_sequence_descriptors = [] # old_sequence_descriptors = run_query_model( # model=model, From 75c7a1d39e4c438b539173a8bc911d9362c8caf3 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Sun, 12 Oct 2025 02:05:28 +0400 Subject: [PATCH 086/102] Validator bug fixed --- extraction/pipeline_pre_quest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 7c151d6..200661e 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1056,8 +1056,7 @@ def run_query_model_speed_up( with open(raw_txt_path, mode="at", encoding="utf-8") as f: f.write(f"> {query}\n< {raw_json}\n\n") - validator = Draft202012Validator(json.loads(schema)) - + validator = Draft202012Validator(schema) errors = sorted(validator.iter_errors(obj), key=lambda er: er.path) if errors: # fix_query = f"There was a task: {query} on which the LLM produced an output:\n```json\n{raw_json}\n```. 
Please, rewrite it to satisfy the given schema format:\n```json\n{json.dumps(schema)}\nReturn null if and only if there is not enough data and provided data is insufficient for inferring the request.```." From 915aa22dffaef95c9b0e9b685e10d410e1c89a5a Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Sun, 12 Oct 2025 02:11:38 +0400 Subject: [PATCH 087/102] Return back to snippets --- extraction/pipeline_pre_quest.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 200661e..78a6c0f 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1013,10 +1013,11 @@ def run_query_model_speed_up( sys_prompt = ( prompt + "\n\nYou will answer a series of short JSON-only questions about a SINGLE candidate probe sequence.\n" - + "You MUST base answers ONLY on this article text:\n
\n" - + article_text - + "\n
\n" - + f"And the most relevant snippet seems to be \nsnippet\n\n\n" + #+ "You MUST base answers ONLY on this article text:\n
\n" + #+ article_text + #+ "\n
\n" + #+ f"And the most relevant snippet seems to be \nsnippet\n\n\n" + + f"You MUST base answers ONLY on this article snipet: \nsnippet\n\n\n" + "The candidate for being a probe sequence is:\n\n" + seq + "\n\nAnd you must bow work with only this sequence and all relevant context for it. You will be asked a series of questions about this sequence.\n" From bdeafa9f13c11020358b295944eb02fe4e914128 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Sun, 12 Oct 2025 02:18:00 +0400 Subject: [PATCH 088/102] Try to easy-fix an object --- extraction/pipeline_pre_quest.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 78a6c0f..2938991 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1060,6 +1060,26 @@ def run_query_model_speed_up( validator = Draft202012Validator(schema) errors = sorted(validator.iter_errors(obj), key=lambda er: er.path) if errors: + try: + expected_type = schema.get("type") + if expected_type is not None and (expected_type == "string" or "string" in set(expected_type)): + probable_value = str(obj.get("value", obj.get("type"))) + if probable_value is not None: + validator_easy = Draft202012Validator(schema) + errors_easy = sorted(validator.iter_errors(probable_value), key=lambda er: er.path) + if not errors_easy: + answers_log.append( + { + "sequence": seq, + "param": param, + "response": obj, + "fixed_response": probable_value, + } + ) + seq_desc[param] = probable_value + continue + except Exception as e: + logger.exception("Failed to easily-fix an object") # fix_query = f"There was a task: {query} on which the LLM produced an output:\n```json\n{raw_json}\n```. Please, rewrite it to satisfy the given schema format:\n```json\n{json.dumps(schema)}\nReturn null if and only if there is not enough data and provided data is insufficient for inferring the request.```." 
# fix_query = f"Rewrite the object {raw_json} in the new schema. Return null if and only if there is not enough data and provided data is insufficient for inferring the request.```." fix_chat = outlines.inputs.Chat() From 2bef036a1e57eac51ed7aff89e9bc228db724149 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Sun, 12 Oct 2025 02:20:14 +0400 Subject: [PATCH 089/102] Easy validation fixed --- extraction/pipeline_pre_quest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 2938991..eb5612f 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1066,8 +1066,10 @@ def run_query_model_speed_up( probable_value = str(obj.get("value", obj.get("type"))) if probable_value is not None: validator_easy = Draft202012Validator(schema) - errors_easy = sorted(validator.iter_errors(probable_value), key=lambda er: er.path) + errors_easy = sorted(validator_easy.iter_errors(probable_value), key=lambda er: er.path) if not errors_easy: + with open(raw_txt_path, mode="at", encoding="utf-8") as f: + f.write(f"> FIX_EASY\n< {probable_value}\n\n") answers_log.append( { "sequence": seq, From 98c6ba323b25d6b6e41a202bfeac7d47055bb57e Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Sun, 12 Oct 2025 02:23:06 +0400 Subject: [PATCH 090/102] Easy-fix sould be even more applicable --- extraction/pipeline_pre_quest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index eb5612f..081932f 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1062,7 +1062,7 @@ def run_query_model_speed_up( if errors: try: expected_type = schema.get("type") - if expected_type is not None and (expected_type == "string" or "string" in set(expected_type)): + if expected_type is not None and (expected_type != "object"): probable_value = str(obj.get("value", 
obj.get("type"))) if probable_value is not None: validator_easy = Draft202012Validator(schema) From 83d05af5ed6334f177021c3b25cb8b06c774d8d9 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Sun, 12 Oct 2025 02:27:08 +0400 Subject: [PATCH 091/102] Try adding full article, not snippets --- extraction/pipeline_pre_quest.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 081932f..1d5ace8 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1013,11 +1013,11 @@ def run_query_model_speed_up( sys_prompt = ( prompt + "\n\nYou will answer a series of short JSON-only questions about a SINGLE candidate probe sequence.\n" - #+ "You MUST base answers ONLY on this article text:\n
\n" - #+ article_text - #+ "\n
\n" - #+ f"And the most relevant snippet seems to be \nsnippet\n\n\n" - + f"You MUST base answers ONLY on this article snipet: \nsnippet\n\n\n" + + "You MUST base answers ONLY on this article text:\n
\n" + + article_text + + "\n
\n" + + f"And the most relevant snippet seems to be \nsnippet\n\n\n" + # + f"You MUST base answers ONLY on this article snipet: \nsnippet\n\n\n" + "The candidate for being a probe sequence is:\n\n" + seq + "\n\nAnd you must bow work with only this sequence and all relevant context for it. You will be asked a series of questions about this sequence.\n" From 977294204556e803fde0b014dce2b410df7a4af1 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Sun, 12 Oct 2025 02:31:05 +0400 Subject: [PATCH 092/102] Try making simple fixer even more robust --- extraction/pipeline_pre_quest.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 1d5ace8..4115358 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1064,6 +1064,11 @@ def run_query_model_speed_up( expected_type = schema.get("type") if expected_type is not None and (expected_type != "object"): probable_value = str(obj.get("value", obj.get("type"))) + try: + keys = obj.keys() + probable_value = obj[keys[-1]] + except: + pass if probable_value is not None: validator_easy = Draft202012Validator(schema) errors_easy = sorted(validator_easy.iter_errors(probable_value), key=lambda er: er.path) From 8a68061678de941d4fac135a2f31282184d67456 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Sun, 12 Oct 2025 02:32:17 +0400 Subject: [PATCH 093/102] Remove non-strict seq pass --- extraction/config/pipeline.json | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index aab5980..30f5910 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -19,7 +19,7 @@ "common_prompt_path": "passes/common.txt", "db_path": "outlines_output_db/massive.sqlite", "article_glob": "**/*.md", - "pre_passes": [ + "_pre_passes": [ { "name": "SeqPrompt", "schema": "passes/_1_SeqPrompt/schema.json", @@ -33,6 +33,14 
@@ "timeout": 60 } ], + "pre_passes": [ + { + "name": "SeqPrompt_strict", + "schema": "passes/_1_SeqPrompt/schema_strict.json", + "prompt": "passes/_1_SeqPrompt/prompt_strict.txt", + "timeout": 60 + } + ], "construct_single_experiment_passes": [], "passes": [ { From dfcab0d8cb10b0dba98714d020ef5ab6c785d452 Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Sun, 12 Oct 2025 02:38:18 +0400 Subject: [PATCH 094/102] Seems that easy fixer should take first mapping --- extraction/pipeline_pre_quest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 4115358..9b8cf3f 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1066,7 +1066,7 @@ def run_query_model_speed_up( probable_value = str(obj.get("value", obj.get("type"))) try: keys = obj.keys() - probable_value = obj[keys[-1]] + probable_value = obj[keys[0]] except: pass if probable_value is not None: From 51d23090bc12f4fef6668769bdbd8d0ea105550f Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Thu, 16 Oct 2025 03:48:24 +0400 Subject: [PATCH 095/102] Add other non-qwen-1m models --- extraction/config/pipeline.json | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 30f5910..db393ca 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,7 +1,9 @@ { "model_names": [ - "myaniu/qwen2.5-1m:7b", - "phi4:14b" + "phi4-mini-reasoning:latest", + "gemma3:4b", + "phi4:14b", + "gemma3:27b" ], "ollama_parameters": { "num_ctx": 65536, From 3e3d4093207c45402315cf6288911e683ae1895f Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Thu, 16 Oct 2025 03:56:06 +0400 Subject: [PATCH 096/102] Add more passes --- extraction/config/pipeline.json | 60 +++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/extraction/config/pipeline.json 
b/extraction/config/pipeline.json index db393ca..9df16ec 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -50,6 +50,66 @@ "schema": "passes/A_core/schema.json", "prompt": "passes/A_core/prompt.txt", "timeout": 60 + }, + { + "name": "B_index", + "schema": "passes/B_index/schema.json", + "prompt": "passes/B_index/prompt.txt", + "timeout": 600 + }, + { + "name": "B1_index_types", + "schema": "passes/B1_index_types/schema.json", + "prompt": "passes/B1_index_types/prompt.txt", + "timeout": 600 + }, + { + "name": "B2_index_desc", + "schema": "passes/B2_index_desc/schema.json", + "prompt": "passes/B2_index_desc/prompt.txt", + "timeout": 600 + }, + { + "name": "C5_probes_opt_target", + "schema": "passes/C5_probes_opt_target/schema.json", + "prompt": "passes/C5_probes_opt_target/prompt.txt", + "timeout": 900 + }, + { + "name": "C_sequences", + "schema": "passes/C_sequences/schema.json", + "prompt": "passes/C_sequences/prompt.txt", + "timeout": 900 + }, + { + "name": "C1_probe_core", + "schema": "passes/C1_probe_core/schema.json", + "prompt": "passes/C1_probe_core/prompt.txt" + }, + { + "name": "C2_target_primers", + "schema": "passes/C2_target_primers/schema.json", + "prompt": "passes/C2_target_primers/prompt.txt" + }, + { + "name": "C3_related", + "schema": "passes/C3_related/schema.json", + "prompt": "passes/C3_related/prompt.txt" + }, + { + "name": "D_parameters", + "schema": "passes/D_parameters/schema.json", + "prompt": "passes/D_parameters/prompt.txt" + }, + { + "name": "E_outcomes", + "schema": "passes/E_outcomes/schema.json", + "prompt": "passes/E_outcomes/prompt.txt" + }, + { + "name": "F_pairings", + "schema": "passes/F_pairings/schema.json", + "prompt": "passes/F_pairings/prompt.txt" } ], "ignored_passes": [ From 72d686bfe464cd5f05f0bb6ae952fb8aad56cfdd Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 31 Oct 2025 01:09:59 +0400 Subject: [PATCH 097/102] Intermediate hyb_db perfmetrics added --- extraction/hyb_db.py 
| 94 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/extraction/hyb_db.py b/extraction/hyb_db.py index 27f61de..bdd02c7 100644 --- a/extraction/hyb_db.py +++ b/extraction/hyb_db.py @@ -24,6 +24,9 @@ Public API: init_db(db_path) insert_article_object(db_path, article_obj, model_name, article_name) + insert_seqdesc_object(...) + insert_perf_event(db_path, event_dict) + insert_perf_events(db_path, [event_dict, ...]) Features: - Auto-initializes schema (tables, indexes, views). @@ -31,6 +34,7 @@ - Normalizes sense/antisense & prime markers. - Guards against non-oligo "probes" (skips probe insertion but keeps experiment). - Includes Ollama-style helper tools with Google docstrings. +- NEW: perf_events table for timings/tokens of every step/question. """ import json import re @@ -219,6 +223,28 @@ def _db(db_path: str): FOREIGN KEY (run_id) REFERENCES runs(id) ON DELETE CASCADE ); CREATE INDEX IF NOT EXISTS idx_no_seq_run ON no_sequences_explanations(run_id); + +/* NEW: generic performance/timing/token metrics for all steps and questions */ +CREATE TABLE IF NOT EXISTS perf_events ( + id INTEGER PRIMARY KEY, + namespace TEXT NOT NULL CHECK (namespace IN ('pre_pass','pass','query','construct','stitch','db_insert','other')), + article_name TEXT, + model_name TEXT, + article_doi TEXT, + pass_name TEXT, + sequence_key TEXT, + question_param TEXT, + started_at TEXT, + finished_at TEXT, + duration_ms REAL, + prompt_tokens INTEGER, + completion_tokens INTEGER, + total_tokens INTEGER, + tokens_per_sec REAL, + sidecar_path TEXT, + notes TEXT +); +CREATE INDEX IF NOT EXISTS idx_perf_ns_article_model ON perf_events(namespace, article_name, model_name); """ _VIEWS_SQL = """ @@ -475,6 +501,72 @@ def insert_article_object(db_path: str, article_obj: Dict[str, Any], return run_id +# NEW: perf events API -------------------------------------------------- # + +def _event_defaults(ev: Dict[str, Any]) -> Dict[str, Any]: + d = dict(ev 
or {}) + for k in ("namespace","article_name","model_name","article_doi","pass_name", + "sequence_key","question_param","started_at","finished_at", + "duration_ms","prompt_tokens","completion_tokens","total_tokens", + "tokens_per_sec","sidecar_path","notes"): + d.setdefault(k, None) + return d + +def insert_perf_event(db_path: str, event: Dict[str, Any]) -> int: + """Insert a single performance/timing event row.""" + with _db(db_path) as conn: + _ensure_schema(conn) + cur = conn.cursor() + e = _event_defaults(event) + cur.execute( + """ + INSERT INTO perf_events ( + namespace, article_name, model_name, article_doi, pass_name, + sequence_key, question_param, started_at, finished_at, duration_ms, + prompt_tokens, completion_tokens, total_tokens, tokens_per_sec, + sidecar_path, notes + ) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) + """, + ( + e["namespace"], e["article_name"], e["model_name"], e["article_doi"], e["pass_name"], + e["sequence_key"], e["question_param"], e["started_at"], e["finished_at"], e["duration_ms"], + e["prompt_tokens"], e["completion_tokens"], e["total_tokens"], e["tokens_per_sec"], + e["sidecar_path"], e["notes"] + ), + ) + return cur.lastrowid + +def insert_perf_events(db_path: str, events: List[Dict[str, Any]]) -> List[int]: + """Bulk insert multiple performance events.""" + ids: List[int] = [] + if not events: + return ids + with _db(db_path) as conn: + _ensure_schema(conn) + cur = conn.cursor() + for ev in events: + e = _event_defaults(ev) + cur.execute( + """ + INSERT INTO perf_events ( + namespace, article_name, model_name, article_doi, pass_name, + sequence_key, question_param, started_at, finished_at, duration_ms, + prompt_tokens, completion_tokens, total_tokens, tokens_per_sec, + sidecar_path, notes + ) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) 
+ """, + ( + e["namespace"], e["article_name"], e["model_name"], e["article_doi"], e["pass_name"], + e["sequence_key"], e["question_param"], e["started_at"], e["finished_at"], e["duration_ms"], + e["prompt_tokens"], e["completion_tokens"], e["total_tokens"], e["tokens_per_sec"], + e["sidecar_path"], e["notes"] + ), + ) + ids.append(cur.lastrowid) + return ids + # ----------------------------- Ollama-style helper tools ----------------------------- # def to_si(value: Optional[float], unit: Optional[str]) -> Tuple[Optional[float], Optional[str]]: @@ -487,7 +579,7 @@ def to_si(value: Optional[float], unit: Optional[str]) -> Tuple[Optional[float], Args: value: The numeric value parsed from the article, or None if unknown. - unit: The unit string as written in the article (e.g., '°C', 'C', 'mM', 'µM', 'nM', '%'), or None. + unit: The unit string as written in the article (e.g., '°C', 'C', 'mM', '%'), or None. Returns: A pair (si_value, si_unit): From e99266a9465ab7279a3eb6984912bff3959eea5f Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 31 Oct 2025 01:22:58 +0400 Subject: [PATCH 098/102] Upgraded solution for continuation and performance metrics --- extraction/hyb_db.py | 341 +++++++++++++++++++- extraction/pipeline_pre_quest.py | 521 +++++++++++++++++++++++++++++-- 2 files changed, 832 insertions(+), 30 deletions(-) diff --git a/extraction/hyb_db.py b/extraction/hyb_db.py index bdd02c7..6c3333a 100644 --- a/extraction/hyb_db.py +++ b/extraction/hyb_db.py @@ -12,7 +12,7 @@ from contextlib import contextmanager, closing from datetime import datetime, timezone from loguru import logger -from ollama import chat, ChatResponse +from ollama import chat, ChatResponse from json_repair import repair_json import os, sys from jsonschema import Draft202012Validator @@ -23,19 +23,58 @@ Public API: init_db(db_path) + + # HYBRIDIZATION ARTICLE / EXPERIMENT STRUCTURE insert_article_object(db_path, article_obj, model_name, article_name) insert_seqdesc_object(...) 
+ + # PERFORMANCE / PIPELINE METRICS insert_perf_event(db_path, event_dict) insert_perf_events(db_path, [event_dict, ...]) + # NEW (ADDED FOR PIPELINE CONTINUATION + SIDE-CAR PERF TRACKING) + insert_pipeline_artifact(db_path, artifact_dict) + get_pipeline_artifacts_for_article(db_path, model_name, article_name) + get_completed_passes(db_path, model_name, article_name) + Features: - Auto-initializes schema (tables, indexes, views). - Preserves every run (no overwrites). - Normalizes sense/antisense & prime markers. - Guards against non-oligo "probes" (skips probe insertion but keeps experiment). - Includes Ollama-style helper tools with Google docstrings. -- NEW: perf_events table for timings/tokens of every step/question. +- perf_events table for timings/tokens of every step/question. +- NEW: pipeline_artifacts table for per-pass / per-file sidecar metrics and + continuation bookkeeping. Each JSON artifact the pipeline writes on disk + (per pass, per article, per model) can have a "sidecar" JSON with timing + and token usage. We mirror that data into pipeline_artifacts so that: + * downstream QC / benchmarking code can query timings and tokens + * the pipeline can resume/continue work by checking which passes for a + given (model_name, article_name) have already succeeded. + The pipeline will: + - emit a sidecar JSON next to every produced .json/.log.json/etc. file + containing timing, token counts, and file paths + - call insert_pipeline_artifact(...) with the same metadata + Because this module always calls _ensure_schema() on connect, the new table + will be created automatically in older existing DBs without migration steps. + +Tables overview +--------------- +articles / runs / raw_payloads / experiments / ... : + Structured hybridization experiment data (final stitched objects). + +seqdesc_* : + Per-sequence descriptors from sequence descriptor passes. + +perf_events : + Fine-grained timing and token usage for any granular step/question. 
+ +pipeline_artifacts : + Coarse-grained artifact-level bookkeeping used for: + - sidecar perf metrics per produced JSON artifact + - continuation / resume logic (which pipeline passes already finished) """ + import json import re import sqlite3 @@ -224,7 +263,7 @@ def _db(db_path: str): ); CREATE INDEX IF NOT EXISTS idx_no_seq_run ON no_sequences_explanations(run_id); -/* NEW: generic performance/timing/token metrics for all steps and questions */ +/* generic performance/timing/token metrics for all steps and questions */ CREATE TABLE IF NOT EXISTS perf_events ( id INTEGER PRIMARY KEY, namespace TEXT NOT NULL CHECK (namespace IN ('pre_pass','pass','query','construct','stitch','db_insert','other')), @@ -245,6 +284,36 @@ def _db(db_path: str): notes TEXT ); CREATE INDEX IF NOT EXISTS idx_perf_ns_article_model ON perf_events(namespace, article_name, model_name); + +/* NEW TABLE: + pipeline_artifacts captures artifact-level metadata (per produced JSON file). + It is designed for: + - performance sidecar ingestion (duration, token counts, sidecar paths) + - resume/continuation bookkeeping (which passes are already done) + The UNIQUE constraint prevents exact duplicate rows for the same (model,article,pass,file), + but allows multiple historical attempts over time if artifact_path differs + (timestamps are embedded in filenames in the pipeline). 
+*/ +CREATE TABLE IF NOT EXISTS pipeline_artifacts ( + id INTEGER PRIMARY KEY, + model_name TEXT NOT NULL, + article_name TEXT NOT NULL, + pass_name TEXT NOT NULL, + artifact_path TEXT NOT NULL, + sidecar_path TEXT, + started_at TEXT, + finished_at TEXT, + duration_ms REAL, + prompt_tokens INTEGER, + completion_tokens INTEGER, + total_tokens INTEGER, + tokens_per_sec REAL, + success INTEGER, -- NULL/0/1 + notes TEXT, + UNIQUE (model_name, article_name, pass_name, artifact_path) +); +CREATE INDEX IF NOT EXISTS idx_pa_lookup ON pipeline_artifacts(model_name, article_name, pass_name); +CREATE INDEX IF NOT EXISTS idx_pa_finished ON pipeline_artifacts(finished_at); """ _VIEWS_SQL = """ @@ -513,7 +582,28 @@ def _event_defaults(ev: Dict[str, Any]) -> Dict[str, Any]: return d def insert_perf_event(db_path: str, event: Dict[str, Any]) -> int: - """Insert a single performance/timing event row.""" + """Insert a single performance/timing event row. + + Typical usage: + insert_perf_event(db_path, { + "namespace": "pass", + "article_name": "paper123", + "model_name": "my/model:7b", + "article_doi": "10.xxxx/yyy", + "pass_name": "A_core", + "sequence_key": None, + "question_param": None, + "started_at": "...", + "finished_at": "...", + "duration_ms": 1234.5, + "prompt_tokens": 4567, + "completion_tokens": 890, + "total_tokens": 5457, + "tokens_per_sec": 12.34, + "sidecar_path": "/path/to/file.sidecar.json", + "notes": "ok" + }) + """ with _db(db_path) as conn: _ensure_schema(conn) cur = conn.cursor() @@ -567,6 +657,249 @@ def insert_perf_events(db_path: str, events: List[Dict[str, Any]]) -> List[int]: ids.append(cur.lastrowid) return ids + +# NEW: pipeline_artifacts API ------------------------------------------ # + +def _artifact_defaults(rec: Dict[str, Any]) -> Dict[str, Any]: + """Normalize/complete an artifact record dict before DB insert. + + Expected keys in `rec`: + model_name : str + article_name : str + pass_name : str (e.g. 
'A_core', 'SeqDesc-OPTIM', 'FULL') + artifact_path : str (absolute or project-rel path to main JSON) + sidecar_path : str|None (path to sidecar .perf.json) + started_at : str|None (ISO8601 UTC) + finished_at : str|None (ISO8601 UTC) + duration_ms : float|None + prompt_tokens : int|None + completion_tokens : int|None + total_tokens : int|None + tokens_per_sec : float|None + success : bool|int|None + notes : str|None + """ + d = dict(rec or {}) + for k in ( + "model_name", "article_name", "pass_name", "artifact_path", + "sidecar_path", "started_at", "finished_at", "duration_ms", + "prompt_tokens", "completion_tokens", "total_tokens", "tokens_per_sec", + "success", "notes" + ): + d.setdefault(k, None) + + # Convert success -> int bool (1/0/NULL) + d["success"] = _to_int_bool(d.get("success")) + return d + + +def insert_pipeline_artifact(db_path: str, artifact: Dict[str, Any]) -> int: + """Insert a single pipeline_artifacts row. + + This captures per-pass / per-article / per-model artifact bookkeeping + (timings + token usage for the JSON file) as well as marking a pass + as 'successfully finished' for continuation. + + NOTE: + We do a plain INSERT. The table has a UNIQUE constraint on + (model_name, article_name, pass_name, artifact_path), so the pipeline + should generate unique artifact_path names (it already includes a + timestamped suffix in filenames). We intentionally do NOT overwrite. + + Returns: + row_id (int). + """ + with _db(db_path) as conn: + _ensure_schema(conn) + cur = conn.cursor() + r = _artifact_defaults(artifact) + cur.execute( + """ + INSERT INTO pipeline_artifacts ( + model_name, article_name, pass_name, artifact_path, sidecar_path, + started_at, finished_at, duration_ms, + prompt_tokens, completion_tokens, total_tokens, tokens_per_sec, + success, notes + ) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?) 
+ """, + ( + r["model_name"], + r["article_name"], + r["pass_name"], + r["artifact_path"], + r["sidecar_path"], + r["started_at"], + r["finished_at"], + r["duration_ms"], + r["prompt_tokens"], + r["completion_tokens"], + r["total_tokens"], + r["tokens_per_sec"], + r["success"], + r["notes"], + ), + ) + return cur.lastrowid + + +def get_pipeline_artifacts_for_article( + db_path: str, + model_name: str, + article_name: str, +) -> List[Dict[str, Any]]: + """Fetch all recorded artifacts for (model_name, article_name). + + This is useful for: + - debugging + - external QC scripts + - continuation logic in the pipeline (to see what's already done) + + Returns: + A list (possibly empty) of dicts, newest-first by finished_at (NULL last). + """ + with _db(db_path) as conn: + _ensure_schema(conn) + cur = conn.cursor() + cur.execute( + """ + SELECT + id, + model_name, + article_name, + pass_name, + artifact_path, + sidecar_path, + started_at, + finished_at, + duration_ms, + prompt_tokens, + completion_tokens, + total_tokens, + tokens_per_sec, + success, + notes + FROM pipeline_artifacts + WHERE model_name = ? + AND article_name = ? 
+ ORDER BY + CASE WHEN finished_at IS NULL THEN 1 ELSE 0 END, + finished_at DESC + """, + (model_name, article_name), + ) + rows = cur.fetchall() + + out_rows: List[Dict[str, Any]] = [] + for row in rows: + ( + rid, mname, aname, pass_name, artifact_path, sidecar_path, + started_at, finished_at, duration_ms, + prompt_tokens, completion_tokens, total_tokens, tokens_per_sec, + success, notes + ) = row + out_rows.append( + { + "id": rid, + "model_name": mname, + "article_name": aname, + "pass_name": pass_name, + "artifact_path": artifact_path, + "sidecar_path": sidecar_path, + "started_at": started_at, + "finished_at": finished_at, + "duration_ms": duration_ms, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + "tokens_per_sec": tokens_per_sec, + "success": bool(success) if success is not None else None, + "notes": notes, + } + ) + return out_rows + + +def get_completed_passes( + db_path: str, + model_name: str, + article_name: str, +) -> Dict[str, Dict[str, Any]]: + """Return a summary of which logical passes have already completed + successfully for (model_name, article_name). + + The pipeline can use this to implement continuation / resume: + - If a pass_name appears here with success True, the pipeline MAY skip + regenerating that pass for that (model, article), unless --fresh was + requested or the pipeline.json layout changed and the user wants that + pass rerun anyway. + + Returns: + dict: + { + "A_core": { + "finished_at": "...", + "artifact_path": ".../paper__A_core__model__timestamp.json", + "sidecar_path": ".../paper__A_core__model__timestamp.perf.json", + "duration_ms": 1234.5, + "total_tokens": 9876, + ... + }, + ... + } + + If multiple rows exist for the same pass_name we pick the most recent row + with success == 1 (by finished_at DESC). If none are successful for a + pass, that pass won't appear in the dict. 
+ """ + artifacts = get_pipeline_artifacts_for_article( + db_path=db_path, model_name=model_name, article_name=article_name + ) + + best: Dict[str, Dict[str, Any]] = {} + for row in artifacts: + pname = row["pass_name"] + if not row.get("success"): + continue + prev = best.get(pname) + if prev is None: + best[pname] = row + continue + # pick newer finished_at + prev_finished = prev.get("finished_at") + cur_finished = row.get("finished_at") + # if prev_finished is None but cur_finished not None -> prefer current + # if both not None -> compare lexicographically (ISO8601 so OK) + take_current = False + if prev_finished is None and cur_finished is not None: + take_current = True + elif prev_finished is not None and cur_finished is not None: + if str(cur_finished) > str(prev_finished): + take_current = True + elif prev_finished is None and cur_finished is None: + # keep first, arbitrary + take_current = False + # else prev has finished_at but current doesn't — keep prev. + if take_current: + best[pname] = row + + # Only expose a shallow summary for convenience + summarized: Dict[str, Dict[str, Any]] = {} + for pname, row in best.items(): + summarized[pname] = { + "finished_at": row.get("finished_at"), + "artifact_path": row.get("artifact_path"), + "sidecar_path": row.get("sidecar_path"), + "duration_ms": row.get("duration_ms"), + "prompt_tokens": row.get("prompt_tokens"), + "completion_tokens": row.get("completion_tokens"), + "total_tokens": row.get("total_tokens"), + "tokens_per_sec": row.get("tokens_per_sec"), + "notes": row.get("notes"), + } + return summarized + + # ----------------------------- Ollama-style helper tools ----------------------------- # def to_si(value: Optional[float], unit: Optional[str]) -> Tuple[Optional[float], Optional[str]]: diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 9b8cf3f..6818a7a 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -9,12 +9,40 @@ - Stitches 
pass outputs into a full object, validates against full schema (if provided), and optionally inserts into SQLite via hyb_db.insert_article_object. +NEW FEATURES ADDED: +1. Performance metrics / sidecar files: + - For every JSON artifact we emit, we now also write a ".perf.json" + sidecar with: + - start/end timestamps + - wallclock duration_ms + - token usage (if available from Ollama; otherwise nulls) + - simple throughput + - model_name / article_name / pass_name + - artifact path + These metrics are also mirrored into SQLite (if cfg.db_path is not None) + using hyb_db.insert_pipeline_artifact(...), with automatic table creation. + - This allows downstream automated QC / benchmarking. + +2. Continuation / resume: + - The pipeline can resume across runs without losing progress. + - Unless --fresh is passed, we will SKIP any (model_name, article_name) + pair that is already marked complete for pass_name "FULL" in the DB + (hyb_db.get_completed_passes()). + - This satisfies the requirement that we do not reprocess already-finished + articles on rerun. + - Re-doing the _current_ interrupted article from scratch is acceptable + (the spec explicitly allows reparsing one article). 
+ + Requirements: - pip install outlines ollama jsonschema tqdm + pip install outlines ollama jsonschema tqdm json_repair loguru Usage (script): from pipeline_filedriven import run_project - run_project("your_project_dir") + run_project("your_project_dir", fresh=False) + +CLI: + python pipeline_filedriven.py [--fresh] The project_dir must contain (by default): config/pipeline.json @@ -27,8 +55,9 @@ import logging import re import os, sys +import time from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, List, Literal, Optional, Set, Tuple @@ -196,6 +225,105 @@ def _make_logger(log_dir: Path) -> logging.Logger: return logger +# ────────────────────────────────────────────────────────────────────── +# NEW: Perf / sidecar helpers (feature 1) and continuation helpers (feature 2) +# ────────────────────────────────────────────────────────────────────── + + +def _write_perf_sidecar_and_db( + *, + artifact_path: Path, + pass_name: str, + model_name: str, + article_name: str, + start_time: datetime, + end_time: datetime, + prompt_tokens: Optional[int], + completion_tokens: Optional[int], + db_path: Optional[Path], + logger: logging.Logger, + notes: Optional[str] = None, +) -> None: + """Write .perf.json sidecar and mirror metrics into SQLite. + + - duration_ms: wallclock elapsed between start_time and end_time + - prompt_tokens / completion_tokens: from Ollama metadata when available + - total_tokens / tokens_per_sec: derived + - if db_path is provided, also insert a row into hyb_db.pipeline_artifacts + (auto-creates tables if needed) + + This function never raises; it logs exceptions instead. 
+ """ + try: + duration_ms = (end_time - start_time).total_seconds() * 1000.0 + except Exception: + duration_ms = None + + total_tokens = None + if (prompt_tokens is not None) or (completion_tokens is not None): + total_tokens = (prompt_tokens or 0) + (completion_tokens or 0) + + tokens_per_sec = None + try: + if total_tokens is not None and duration_ms and duration_ms > 0: + tokens_per_sec = total_tokens / (duration_ms / 1000.0) + except Exception: + tokens_per_sec = None + + sidecar_dict = { + "model_name": model_name, + "article_name": article_name, + "pass_name": pass_name, + "artifact_path": str(artifact_path), + "started_at": start_time.isoformat(), + "finished_at": end_time.isoformat(), + "duration_ms": duration_ms, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + "tokens_per_sec": tokens_per_sec, + "notes": notes, + } + + sidecar_path = Path(str(artifact_path) + ".perf.json") + try: + sidecar_path.write_text( + json.dumps(sidecar_dict, indent=2, ensure_ascii=False), encoding="utf-8" + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to write sidecar for {artifact_path}: {repr(e)}" + ) + + if db_path: + try: + from hyb_db import insert_pipeline_artifact + + insert_pipeline_artifact( + db_path=str(db_path), + artifact={ + "model_name": model_name, + "article_name": article_name, + "pass_name": pass_name, + "artifact_path": str(artifact_path), + "sidecar_path": str(sidecar_path), + "started_at": start_time.isoformat(), + "finished_at": end_time.isoformat(), + "duration_ms": duration_ms, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + "tokens_per_sec": tokens_per_sec, + "success": True, + "notes": notes, + }, + ) + except Exception as e: + logger.exception( + f"[PERF][DB] insert_pipeline_artifact failed for {artifact_path}: {repr(e)}" + ) + + # ────────────────────────────────────────────────────────────────────── # Tools 
(Ollama helpers) — Google-style docstrings # ────────────────────────────────────────────────────────────────────── @@ -351,6 +479,10 @@ class OllamaJSONChat: We seed once with a system prompt (includes article snippet & sequence), then for each question we call generate() with only the new instruction and pass the returned `context` back in. + + NEW: + - self._last_meta stores token / timing metadata from the most recent + .ask_json() call so we can accumulate perf stats. """ def __init__( @@ -370,9 +502,10 @@ def __init__( self.keep_alive = keep_alive self.logger = logger or logging.getLogger("OllamaJSONChat") self.context: Optional[List[int]] = None + self._last_meta: Dict[str, Any] = {} + self._schema_supported = False # Bootstrap the KV cache with the system prompt once. - # We don't care about the text reply here; we only keep the returned context. boot = self.client.generate( model=self.model_name, prompt=system_prompt, @@ -382,7 +515,6 @@ def __init__( self.context = boot.get("context") # Detect JSON schema support (best effort: try once without touching our context). - self._schema_supported = False if use_schema_format: try: _ = self.client.generate( @@ -407,6 +539,8 @@ def ask_json( Ask a single question. Only the new instruction is sent; the previous state is carried via `context`. Returns the raw text from `response`. + + Also captures Ollama's prompt_eval_count / eval_count in self._last_meta. """ kwargs = dict( model=self.model_name, @@ -423,6 +557,17 @@ def ask_json( res = self.client.generate(**kwargs) # Persist updated KV context self.context = res.get("context", self.context) + + # Capture perf metadata from Ollama response. + # Typical keys: prompt_eval_count, eval_count, total_duration, etc. 
+ self._last_meta = { + "prompt_eval_count": res.get("prompt_eval_count"), + "eval_count": res.get("eval_count"), + "total_duration": res.get("total_duration"), + "prompt_eval_duration": res.get("prompt_eval_duration"), + "eval_duration": res.get("eval_duration"), + } + return res.get("response", "") # generate() returns 'response' @@ -473,10 +618,17 @@ def run_query_model_speed_up( tqdm_position: int = 0, client: Optional[ollama.Client] = None, # NEW: pass the ollama client here chat_prompts: Literal["my", "optimized"] = "my", + db_path: Optional[Path] = None, # NEW: for perf sidecar + DB + article_name: Optional[str] = None, # NEW: for perf sidecar + DB ) -> List[Tuple[str, Any]]: """ Faster version: use Ollama chat 'context' to avoid re-sending the whole chat every turn, and seed each sequence with a small snippet instead of the full article. + + NEW: + - We track timing & token counts across the entire pass (all sequences/questions). + - We emit .perf.json sidecars next to generated .json/.log.json. + - We mirror those metrics into SQLite for continuation / benchmarking. """ if client is None: raise ValueError( @@ -999,6 +1151,12 @@ def run_query_model_speed_up( answers_log: List[Dict[str, Any]] = [] described_sequences: List[Tuple[str, Dict[str, Any]]] = [] + # PERF tracking for the entire sequence-descriptor pass: + perf_start_dt = datetime.now(timezone.utc) + agg_prompt_tokens = 0 + agg_completion_tokens = 0 + have_token_info = False + try: for seq in tqdm( sequences, @@ -1017,7 +1175,6 @@ def run_query_model_speed_up( + article_text + "\n
\n" + f"And the most relevant snippet seems to be \nsnippet\n\n\n" - # + f"You MUST base answers ONLY on this article snipet: \nsnippet\n\n\n" + "The candidate for being a probe sequence is:\n\n" + seq + "\n\nAnd you must bow work with only this sequence and all relevant context for it. You will be asked a series of questions about this sequence.\n" @@ -1048,14 +1205,25 @@ def run_query_model_speed_up( + "\nReturn ONLY valid JSON matching this schema:\n" + json.dumps(schema, ensure_ascii=False) ) - raw_json = chat.ask_json(user_msg, schema=schema) + q_raw_json = chat.ask_json(user_msg, schema=schema) # Best-effort repair + parse - fixed = repair_json(raw_json) + fixed = repair_json(q_raw_json) obj = json.loads(fixed) + # PERF gather from last_meta + meta = getattr(chat, "_last_meta", {}) or {} + pt = meta.get("prompt_eval_count") + ct = meta.get("eval_count") + if pt is not None or ct is not None: + have_token_info = True + if pt is not None: + agg_prompt_tokens += pt + if ct is not None: + agg_completion_tokens += ct + # Persist logs with open(raw_txt_path, mode="at", encoding="utf-8") as f: - f.write(f"> {query}\n< {raw_json}\n\n") + f.write(f"> {query}\n< {q_raw_json}\n\n") validator = Draft202012Validator(schema) errors = sorted(validator.iter_errors(obj), key=lambda er: er.path) @@ -1071,10 +1239,19 @@ def run_query_model_speed_up( pass if probable_value is not None: validator_easy = Draft202012Validator(schema) - errors_easy = sorted(validator_easy.iter_errors(probable_value), key=lambda er: er.path) + errors_easy = sorted( + validator_easy.iter_errors(probable_value), + key=lambda er: er.path, + ) if not errors_easy: - with open(raw_txt_path, mode="at", encoding="utf-8") as f: - f.write(f"> FIX_EASY\n< {probable_value}\n\n") + with open( + raw_txt_path, + mode="at", + encoding="utf-8", + ) as f: + f.write( + f"> FIX_EASY\n< {probable_value}\n\n" + ) answers_log.append( { "sequence": seq, @@ -1087,14 +1264,13 @@ def run_query_model_speed_up( continue except 
Exception as e: logger.exception("Failed to easily-fix an object") - # fix_query = f"There was a task: {query} on which the LLM produced an output:\n```json\n{raw_json}\n```. Please, rewrite it to satisfy the given schema format:\n```json\n{json.dumps(schema)}\nReturn null if and only if there is not enough data and provided data is insufficient for inferring the request.```." - # fix_query = f"Rewrite the object {raw_json} in the new schema. Return null if and only if there is not enough data and provided data is insufficient for inferring the request.```." + # Fallback path using outlines for schema repair fix_chat = outlines.inputs.Chat() fix_chat.add_system_message( prompt + f"\nIn this chat you have to transform the user-provided JSON object to match the following schema:\n```json\n{json.dumps(schema)}\n```\n. If user provided-data is not enough to fill-in some fields, put null value in them, but try harder to transform as much data to the new schema as possible. Please do not modify or invent values by yourself. Just move existing values to the corresponging fields of the schema. Please be thoughtful and careful while doing so!" 
) - fix_chat.add_user_message(raw_json) + fix_chat.add_user_message(q_raw_json) try: format_fixed_raw_json = think_generate( model=model, @@ -1109,12 +1285,13 @@ def run_query_model_speed_up( f"Error on model {model.model_name}, sequence {seq}, query {query} and prompts {chat_prompts}" ) print("", flush=True) - format_fixed_raw_json = raw_json + format_fixed_raw_json = q_raw_json # Persist logs - # msgs = '\n'.join(map(lambda k,v: "\n".join([f"{k}: {v}"]), fix_chat.messages)) with open(raw_txt_path, mode="at", encoding="utf-8") as f: - f.write(f"> FIX_PROMPT\n< {format_fixed_raw_json}\n\n") + f.write( + f"> FIX_PROMPT\n< {format_fixed_raw_json}\n\n" + ) format_fixed = repair_json(format_fixed_raw_json) fixed_obj = json.loads(format_fixed) @@ -1143,6 +1320,7 @@ def run_query_model_speed_up( described_sequences.append((seq, seq_desc)) finally: + # Write log + output JSONs (original behavior) json_log_path.write_text( json.dumps(answers_log, indent=2, ensure_ascii=False), encoding="utf-8" ) @@ -1152,6 +1330,59 @@ def run_query_model_speed_up( ), encoding="utf-8", ) + + # PERF sidecar + DB insert for this pass. 
+ perf_end_dt = datetime.now(timezone.utc) + # token stats aggregated over Q&A: + prompt_tokens = agg_prompt_tokens if have_token_info else None + completion_tokens = agg_completion_tokens if have_token_info else None + + # pick user-facing pass label + pass_label = ( + "SeqDesc-OPTIM" if chat_prompts == "optimized" else "SeqDesc-MY" + ) + + if article_name is None: + article_name = article_stem + + try: + _write_perf_sidecar_and_db( + artifact_path=json_out_path, + pass_name=pass_label, + model_name=model_name, + article_name=article_name, + start_time=perf_start_dt, + end_time=perf_end_dt, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + db_path=db_path, + logger=logger, + notes=f"{pass_label} per-article descriptor map", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {json_out_path}: {repr(e)}" + ) + + try: + _write_perf_sidecar_and_db( + artifact_path=json_log_path, + pass_name=pass_label, + model_name=model_name, + article_name=article_name, + start_time=perf_start_dt, + end_time=perf_end_dt, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + db_path=db_path, + logger=logger, + notes=f"{pass_label} Q&A log", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {json_log_path}: {repr(e)}" + ) + return described_sequences @@ -1202,8 +1433,18 @@ def run_single_pass( logger: logging.Logger, ollama_parameters: Dict[str, Any], model_name: str, + db_path: Optional[Path] = None, # NEW: for perf sidecar + DB + article_name: Optional[str] = None, # NEW: for perf sidecar + DB ) -> Dict[str, Any]: - """Run one pass (schema+prompt from files), save raw+json+log, return object.""" + """Run one pass (schema+prompt from files), save raw+json+log, return object. + + NEW: + - We track timing for this single pass. + - We emit .perf.json containing timing & token stats. + - We mirror those metrics to SQLite via insert_pipeline_artifact(). 
+ """ + perf_start_dt = datetime.now(timezone.utc) + txt_dir = out_base / "txt" json_dir = out_base / "json" log_dir = out_base / "logs" @@ -1275,6 +1516,30 @@ def run_single_pass( json_out_path.write_text( json.dumps(obj, indent=2, ensure_ascii=False), encoding="utf-8" ) + + # PERF sidecar + DB insert + perf_end_dt = datetime.now(timezone.utc) + if article_name is None: + article_name = article_stem + try: + _write_perf_sidecar_and_db( + artifact_path=json_out_path, + pass_name=pass_cfg.name, + model_name=model_name, + article_name=article_name, + start_time=perf_start_dt, + end_time=perf_end_dt, + prompt_tokens=None, # Outlines doesn't expose token counts directly + completion_tokens=None, + db_path=db_path, + logger=logger, + notes=f"{pass_cfg.name} single-pass extraction", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {json_out_path}: {repr(e)}" + ) + return obj @@ -1290,8 +1555,16 @@ def run_construct_single_experiment_pass( logger: logging.Logger, ollama_parameters: Dict[str, Any], model_name: str, + db_path: Optional[Path] = None, # NEW: for perf sidecar + DB + article_name: Optional[str] = None, # NEW: for perf sidecar + DB ) -> Dict[str, Any]: - """Run one pass (schema+prompt from files), save raw+json+log, return object.""" + """Run one pass (schema+prompt from files), save raw+json+log, return object. + + NEW: + - Perf timing + sidecar + DB artifact row (per sequence_id). 
+ """ + perf_start_dt = datetime.now(timezone.utc) + txt_dir = out_base / "txt" json_dir = out_base / "json" log_dir = out_base / "logs" @@ -1393,6 +1666,30 @@ def run_construct_single_experiment_pass( json_out_path.write_text( json.dumps(obj, indent=2, ensure_ascii=False), encoding="utf-8" ) + + # PERF sidecar + DB insert + perf_end_dt = datetime.now(timezone.utc) + if article_name is None: + article_name = article_stem + try: + _write_perf_sidecar_and_db( + artifact_path=json_out_path, + pass_name=f"{pass_cfg.name}__{sequence_id}", + model_name=model_name, + article_name=article_name, + start_time=perf_start_dt, + end_time=perf_end_dt, + prompt_tokens=None, + completion_tokens=None, + db_path=db_path, + logger=logger, + notes=f"{pass_cfg.name} construct_single_experiment_pass for sequence {sequence_id}", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {json_out_path}: {repr(e)}" + ) + return obj @@ -2148,8 +2445,19 @@ def _merge_from(pass_name: str, fields: List[str]): # ────────────────────────────────────────────────────────────────────── -def run_project(project_dir: str | Path) -> None: - """Run the pipeline as configured by files under project_dir.""" +def run_project(project_dir: str | Path, fresh: bool = False) -> None: + """Run the pipeline as configured by files under project_dir. + + NEW: + - fresh=False (default): continuation mode. We skip any article/model pair + that already has a successful "FULL" pass recorded in the DB + (hyb_db.pipeline_artifacts.success == 1 for pass_name "FULL"). + - fresh=True: force re-run everything. + + NOTE: + - Within a single interrupted article we may redo that article from scratch. + This is allowed per spec. 
+ """ project_dir = Path(project_dir) cfg = load_pipeline_config(project_dir) @@ -2195,6 +2503,29 @@ def run_project(project_dir: str | Path) -> None: files, desc=f"Articles for model {model_name}", position=1, leave=False ): article_name = art_path.stem + + # CONTINUATION CHECK: + # If --fresh was not passed and db_path is configured, attempt to skip + # articles already fully processed for this model. + if (not fresh) and cfg.db_path: + try: + from hyb_db import get_completed_passes + + completed = get_completed_passes( + db_path=str(cfg.db_path), + model_name=model_name, + article_name=article_name, + ) + if "FULL" in completed: + logger.info( + f"[CONTINUE] Skipping {article_name} for model {model_name} (already FULL)." + ) + continue + except Exception: + logger.exception( + f"[CONTINUE] Continuation check failed for {article_name}:{model_name}; proceeding with full run." + ) + logger.info(f"=== {article_name} : {model_name} ===") article_text = art_path.read_text(encoding="utf-8") @@ -2217,6 +2548,8 @@ def run_project(project_dir: str | Path) -> None: logger=logger, ollama_parameters=cfg.ollama_parameters, model_name=model_name, + db_path=cfg.db_path, + article_name=article_name, ) except Exception: logger.exception( @@ -2253,6 +2586,8 @@ def run_project(project_dir: str | Path) -> None: logger=logger, ollama_parameters=cfg.ollama_parameters, model_name=model_name, + db_path=cfg.db_path, + article_name=article_name, ) except Exception: logger.exception( @@ -2272,6 +2607,8 @@ def run_project(project_dir: str | Path) -> None: tqdm_position=2, client=client, # <-- important: pass the raw ollama.Client chat_prompts="optimized", + db_path=cfg.db_path, + article_name=article_name, ) stamp = _now_stamp() @@ -2281,12 +2618,35 @@ def run_project(project_dir: str | Path) -> None: full_dir / f"{article_name}_{model_name_encode(model_name)}__SeqDesc-OPTIM__{stamp}.json" ) + # record perf for creating this full copy (aggregation-only, + # tokens not known here separately) 
+ write_start_dt = datetime.now(timezone.utc) full_seq_desc_path.write_text( json.dumps( optimized_sequence_descriptors, indent=2, ensure_ascii=False ), encoding="utf-8", ) + write_end_dt = datetime.now(timezone.utc) + if cfg.db_path: + try: + _write_perf_sidecar_and_db( + artifact_path=full_seq_desc_path, + pass_name="SeqDesc-OPTIM", + model_name=model_name, + article_name=article_name, + start_time=write_start_dt, + end_time=write_end_dt, + prompt_tokens=None, + completion_tokens=None, + db_path=cfg.db_path, + logger=logger, + notes="SeqDesc-OPTIM aggregation file", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {full_seq_desc_path}: {repr(e)}" + ) try: # Optional DB insert @@ -2325,6 +2685,8 @@ def run_project(project_dir: str | Path) -> None: tqdm_position=2, client=client, # <-- important: pass the raw ollama.Client chat_prompts="my", + db_path=cfg.db_path, + article_name=article_name, ) stamp = _now_stamp() @@ -2334,10 +2696,31 @@ def run_project(project_dir: str | Path) -> None: full_dir / f"{article_name}_{model_name_encode(model_name)}__SeqDesc-MY__{stamp}.json" ) + write_start_dt = datetime.now(timezone.utc) full_seq_desc_path.write_text( json.dumps(my_sequence_descriptors, indent=2, ensure_ascii=False), encoding="utf-8", ) + write_end_dt = datetime.now(timezone.utc) + if cfg.db_path: + try: + _write_perf_sidecar_and_db( + artifact_path=full_seq_desc_path, + pass_name="SeqDesc-MY", + model_name=model_name, + article_name=article_name, + start_time=write_start_dt, + end_time=write_end_dt, + prompt_tokens=None, + completion_tokens=None, + db_path=cfg.db_path, + logger=logger, + notes="SeqDesc-MY aggregation file", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {full_seq_desc_path}: {repr(e)}" + ) try: # Optional DB insert @@ -2387,10 +2770,31 @@ def run_project(project_dir: str | Path) -> None: full_dir / f"{article_name}_{model_name_encode(model_name)}__SeqDesc-OLD__{stamp}.json" 
) + write_start_dt = datetime.now(timezone.utc) full_seq_desc_path.write_text( json.dumps(old_sequence_descriptors, indent=2, ensure_ascii=False), encoding="utf-8", ) + write_end_dt = datetime.now(timezone.utc) + if cfg.db_path: + try: + _write_perf_sidecar_and_db( + artifact_path=full_seq_desc_path, + pass_name="SeqDesc-OLD", + model_name=model_name, + article_name=article_name, + start_time=write_start_dt, + end_time=write_end_dt, + prompt_tokens=None, + completion_tokens=None, + db_path=cfg.db_path, + logger=logger, + notes="SeqDesc-OLD aggregation file", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {full_seq_desc_path}: {repr(e)}" + ) try: # Optional DB insert @@ -2428,10 +2832,31 @@ def run_project(project_dir: str | Path) -> None: full_dir / f"{article_name}_{model_name_encode(model_name)}__SeqDesc-FULL__{stamp}.json" ) + write_start_dt = datetime.now(timezone.utc) full_seq_desc_path.write_text( json.dumps(sequence_descriptors, indent=2, ensure_ascii=False), encoding="utf-8", ) + write_end_dt = datetime.now(timezone.utc) + if cfg.db_path: + try: + _write_perf_sidecar_and_db( + artifact_path=full_seq_desc_path, + pass_name="SeqDesc-FULL", + model_name=model_name, + article_name=article_name, + start_time=write_start_dt, + end_time=write_end_dt, + prompt_tokens=None, + completion_tokens=None, + db_path=cfg.db_path, + logger=logger, + notes="SeqDesc-FULL combined aggregation file", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {full_seq_desc_path}: {repr(e)}" + ) for i, seq in enumerate( tqdm( @@ -2459,12 +2884,18 @@ def run_project(project_dir: str | Path) -> None: logger=logger, ollama_parameters=cfg.ollama_parameters, model_name=model_name, + db_path=cfg.db_path, + article_name=article_name, ) except Exception: logger.exception( f"Pass failed: {p.name} : {article_name} : {model_name}" ) + # Prepare timing for final FULL object stitching+DB + full_start_dt = 
datetime.now(timezone.utc) + full_path = None + # Stitch only if the expected pass names are present try: A = outputs.get("A_core", {}) @@ -2556,10 +2987,48 @@ def run_project(project_dir: str | Path) -> None: f"[DB INSERT SEQDESC] stitching failed for {article_name} : {model_name}" ) + # PERF sidecar + DB artifact row for FULL artifact. + # Mark article/model as "done" for continuation purposes. + if full_path is not None: + full_end_dt = datetime.now(timezone.utc) + if cfg.db_path: + try: + _write_perf_sidecar_and_db( + artifact_path=full_path, + pass_name="FULL", + model_name=model_name, + article_name=article_name, + start_time=full_start_dt, + end_time=full_end_dt, + prompt_tokens=None, + completion_tokens=None, + db_path=cfg.db_path, + logger=logger, + notes="Final stitched article object + DB inserts", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {full_path}: {repr(e)}" + ) + # Optional CLI hook (project_dir arg) if __name__ == "__main__": - if len(sys.argv) < 2: - print("Usage: python pipeline_filedriven.py ") - sys.exit(1) - run_project(sys.argv[1]) + # Updated CLI to support --fresh + import argparse + + parser = argparse.ArgumentParser( + description="Run the hybridization extraction pipeline." + ) + parser.add_argument( + "project_dir", + help="Path to the project directory containing config/, passes/, inputs/, etc.", + ) + parser.add_argument( + "--fresh", + action="store_true", + help="Disable continuation / resume. 
Re-run all articles even if previously completed (pass 'FULL' already recorded).", + ) + + args = parser.parse_args() + run_project(args.project_dir, fresh=args.fresh) From 5ac2aafd105404e46b9f608689db65cae176b84e Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 31 Oct 2025 01:34:27 +0400 Subject: [PATCH 099/102] Added more models and steps for benchmark --- extraction/config/pipeline.json | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 9df16ec..917bac8 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -1,7 +1,13 @@ { "model_names": [ + "myaniu/qwen2.5-1m:7b", + "deepseek-r1:1.5b", + "qwen2.5-coder:3b", "phi4-mini-reasoning:latest", "gemma3:4b", + "phi3:latest", + "llama3.1:latest", + "myaniu/qwen2.5-1m:14b", "phi4:14b", "gemma3:27b" ], @@ -12,7 +18,7 @@ "seed": 42 }, "ollama_base_url": "http://127.0.0.1:11434", - "timeout_s": 45, + "timeout_s": 60, "__input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", "input_dir": "/mnt/Models/articles2_marker/no_llm/markdown/", "_input_dir": "input/md", @@ -41,9 +47,28 @@ "schema": "passes/_1_SeqPrompt/schema_strict.json", "prompt": "passes/_1_SeqPrompt/prompt_strict.txt", "timeout": 60 + }, + { + "name": "SeqPrompt", + "schema": "passes/_1_SeqPrompt/schema.json", + "prompt": "passes/_1_SeqPrompt/prompt.txt", + "timeout": 60 + } + ], + "construct_single_experiment_passes": [ + { + "name": "_4_ConstructSingleSmallExperiment", + "schema": "passes/_4_ConstructSingleSmallExperiment/schema.json", + "prompt": "passes/_4_ConstructSingleSmallExperiment/prompt.txt", + "timeout": 60 + }, + { + "name": "_6_ConstructSingleSequenceExperimentAndOutcome", + "schema": "passes/_6_ConstructSingleSequenceExperimentAndOutcome/schema.json", + "prompt": "passes/_6_ConstructSingleSequenceExperimentAndOutcome/prompt.txt", + "timeout": 60 } ], - 
"construct_single_experiment_passes": [], "passes": [ { "name": "A_core", From 5f4d2794e88e7d5400b384900f68323834ef825a Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 31 Oct 2025 01:41:15 +0400 Subject: [PATCH 100/102] Changed input directory to benchmarks in markdown --- extraction/config/pipeline.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json index 917bac8..06a36b5 100644 --- a/extraction/config/pipeline.json +++ b/extraction/config/pipeline.json @@ -20,7 +20,8 @@ "ollama_base_url": "http://127.0.0.1:11434", "timeout_s": 60, "__input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", - "input_dir": "/mnt/Models/articles2_marker/no_llm/markdown/", + "___input_dir": "/mnt/Models/articles2_marker/no_llm/markdown/", + "input_dir": "/mnt/Models/articles2_marker/no_llm/bench_md/", "_input_dir": "input/md", "out_dir": "outlines_output_db", "full_schema_path": "schema/json/article.json", From 778ecc2e4cf073556220a166cda572f5b12aa92f Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 12 Dec 2025 17:16:23 +0400 Subject: [PATCH 101/102] Add MIT license header to Python files --- app/app.py | 26 +++ app/database.py | 26 +++ benchmark/make_primer3_template.py | 26 +++ benchmark/parse_blast_stats.py | 26 +++ extraction/extract_articles.py | 26 +++ extraction/hyb_db.py | 26 +++ extraction/pipeline_pre_quest.py | 26 +++ pipeline.py | 26 +++ scripts/articles/fetch_article_text.py | 26 +++ scripts/benchmarking/test_data_gen.py | 26 +++ scripts/databases/generate_noisy_probes.py | 26 +++ scripts/databases/probeBase.py | 144 ++++++++++------- scripts/databases/probeBase_parse.py | 26 +++ scripts/databases/probeBase_wide.py | 26 +++ scripts/generator/ML_filtration.py | 26 +++ scripts/generator/probe_filt.py | 26 +++ scripts/grid_search/test_parameters.py | 26 +++ scripts/loop_generation/download_genomes.py | 26 +++ scripts/loop_generation/merge_outputs.py | 26 +++ 
scripts/validation/pdf_to_seq.py | 26 +++ scripts/validation/validation.py | 26 +++ setup.py | 26 +++ src/PROBESt/AI.py | 26 +++ src/PROBESt/__init__.py | 26 +++ src/PROBESt/args.py | 26 +++ src/PROBESt/bash_wrappers.py | 26 +++ src/PROBESt/check_probe_pdf.py | 26 +++ src/PROBESt/dedegeneration.py | 26 +++ src/PROBESt/evolution.py | 26 +++ src/PROBESt/filtration.py | 26 +++ src/PROBESt/genome_operations.py | 26 +++ src/PROBESt/merge.py | 26 +++ src/PROBESt/misc.py | 26 +++ src/PROBESt/modeling.py | 26 +++ src/PROBESt/models_registry.py | 26 +++ src/PROBESt/oligominer.py | 26 +++ src/PROBESt/prepare_blast.py | 26 +++ src/PROBESt/primer3.py | 26 +++ src/PROBESt/probe_alignment_profiler.py | 26 +++ src/PROBESt/rna_structure.py | 26 +++ src/PROBESt/tokenization.py | 26 +++ src/conf.py | 26 +++ tests/PROBESt/test_AI.py | 26 +++ tests/PROBESt/test_dedegeneration.py | 26 +++ tests/PROBESt/test_filtration.py | 26 +++ tests/PROBESt/test_genome_operations.py | 26 +++ tests/PROBESt/test_merge.py | 26 +++ tests/PROBESt/test_misc.py | 26 +++ tests/PROBESt/test_modeling.py | 26 +++ tests/PROBESt/test_oligominer.py | 26 +++ tests/PROBESt/test_prepare_blast.py | 26 +++ tests/PROBESt/test_primer3.py | 26 +++ .../PROBESt/test_probe_alignment_profiler.py | 26 +++ tests/PROBESt/test_rna_structure.py | 26 +++ tests/__init__.py | 26 +++ tests/conftest.py | 26 +++ tests/database/test_probebase.py | 152 ++++++++++-------- tests/scripts/fasta2table.py | 26 +++ tests/scripts/prep_db.py | 26 +++ tests/scripts/probe_check.py | 26 +++ 60 files changed, 1682 insertions(+), 122 deletions(-) diff --git a/app/app.py b/app/app.py index 635ed2d..2942304 100644 --- a/app/app.py +++ b/app/app.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import os import sys import re diff --git a/app/database.py b/app/database.py index a3bdb92..5bb694d 100644 --- a/app/database.py +++ b/app/database.py @@ -1,4 +1,30 @@ #!/usr/bin/env python3 +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """ PROBESt Database Search Application diff --git a/benchmark/make_primer3_template.py b/benchmark/make_primer3_template.py index 8aac9ac..247c792 100644 --- a/benchmark/make_primer3_template.py +++ b/benchmark/make_primer3_template.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + from Bio import SeqIO import os import sys diff --git a/benchmark/parse_blast_stats.py b/benchmark/parse_blast_stats.py index 0cd7922..25cad7f 100644 --- a/benchmark/parse_blast_stats.py +++ b/benchmark/parse_blast_stats.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + from Bio import SeqIO import pandas as pd import sys diff --git a/extraction/extract_articles.py b/extraction/extract_articles.py index 53eea8c..5158ef6 100644 --- a/extraction/extract_articles.py +++ b/extraction/extract_articles.py @@ -1,4 +1,30 @@ #!/usr/bin/env python3 +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Aleksandr Serdiukov, Vitalii Dravgelis, Daniil Smutin, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import os import re import json5 as json diff --git a/extraction/hyb_db.py b/extraction/hyb_db.py index 6c3333a..2e5d0ac 100644 --- a/extraction/hyb_db.py +++ b/extraction/hyb_db.py @@ -1,4 +1,30 @@ # -*- coding: utf-8 -*- +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Aleksandr Serdiukov, Vitalii Dravgelis, Daniil Smutin, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import outlines from outlines.types import JsonSchema import ollama diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py index 6818a7a..45589f8 100755 --- a/extraction/pipeline_pre_quest.py +++ b/extraction/pipeline_pre_quest.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Aleksandr Serdiukov, Vitalii Dravgelis, Daniil Smutin, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + # pipeline_filedriven.py # -*- coding: utf-8 -*- """ diff --git a/pipeline.py b/pipeline.py index 9dd9726..a98b7ec 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + # Pipeline: PROBESt---- # 1. Initial set generation diff --git a/scripts/articles/fetch_article_text.py b/scripts/articles/fetch_article_text.py index ac9090b..a3e4b19 100644 --- a/scripts/articles/fetch_article_text.py +++ b/scripts/articles/fetch_article_text.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import os import sys import requests diff --git a/scripts/benchmarking/test_data_gen.py b/scripts/benchmarking/test_data_gen.py index 20e2d1a..76207c3 100644 --- a/scripts/benchmarking/test_data_gen.py +++ b/scripts/benchmarking/test_data_gen.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import argparse import json import pandas as pd diff --git a/scripts/databases/generate_noisy_probes.py b/scripts/databases/generate_noisy_probes.py index 715beba..6c90d85 100644 --- a/scripts/databases/generate_noisy_probes.py +++ b/scripts/databases/generate_noisy_probes.py @@ -1,4 +1,30 @@ #!/usr/bin/env python3 +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pandas as pd import numpy as np diff --git a/scripts/databases/probeBase.py b/scripts/databases/probeBase.py index 25a2daf..89b6871 100644 --- a/scripts/databases/probeBase.py +++ b/scripts/databases/probeBase.py @@ -1,60 +1,86 @@ -import requests -from bs4 import BeautifulSoup -import pandas as pd - - -def parse_probebase_page(url: str) -> pd.Series: - """Parse one page from ProbeBase database to the uniform format - - Parameters - ---------- - url : str - URL string, path to the probebase page - - Returns - ------- - table : pd.DataFrame - parsed table from probebase - """ - - # Download html and get table - - response = requests.get(url) - - # Response checking - if response.status_code == 200: - # Parse the HTML content using BeautifulSoup - soup = BeautifulSoup(response.content, 'html.parser') - - # if table is empty - if soup.find_all('tr') == []: # CORRECT!!!!!!!!!!!!!!!!!!! 
- return Warning('Page without probe data.frame or parsing problems') - - # Create a list to hold the rows of the table - table_data = [] - - # Loop through each row in the table - for row in soup.find_all('tr'): - # print(row, "###############") - cells = row.find_all(['td', 'th', 'value']) - row_data = [cell.get_text(strip=True) for cell in cells] - table_data.append(row_data) - - # Convert the list of rows into a DataFrame for easier manipulation - df = pd.DataFrame(table_data) - - # Check if we have enough columns - if df.shape[1] < 2: - return Warning('Table has insufficient columns') - - df.iloc[0,0] = 'Test' - df.iloc[1,0] = 'Name' - - df2 = df.iloc[:,1] - df2.index = df.iloc[:,0] - - # Display the result - return df2 - - else: +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +import requests +from bs4 import BeautifulSoup +import pandas as pd + + +def parse_probebase_page(url: str) -> pd.Series: + """Parse one page from ProbeBase database to the uniform format + + Parameters + ---------- + url : str + URL string, path to the probebase page + + Returns + ------- + table : pd.DataFrame + parsed table from probebase + """ + + # Download html and get table + + response = requests.get(url) + + # Response checking + if response.status_code == 200: + # Parse the HTML content using BeautifulSoup + soup = BeautifulSoup(response.content, 'html.parser') + + # if table is empty + if soup.find_all('tr') == []: # CORRECT!!!!!!!!!!!!!!!!!!! + return Warning('Page without probe data.frame or parsing problems') + + # Create a list to hold the rows of the table + table_data = [] + + # Loop through each row in the table + for row in soup.find_all('tr'): + # print(row, "###############") + cells = row.find_all(['td', 'th', 'value']) + row_data = [cell.get_text(strip=True) for cell in cells] + table_data.append(row_data) + + # Convert the list of rows into a DataFrame for easier manipulation + df = pd.DataFrame(table_data) + + # Check if we have enough columns + if df.shape[1] < 2: + return Warning('Table has insufficient columns') + + df.iloc[0,0] = 'Test' + df.iloc[1,0] = 'Name' + + df2 = df.iloc[:,1] + df2.index = df.iloc[:,0] + + # Display the result + return df2 + + else: return ImportWarning(f"Failed to retrieve the page. 
Status code: {response.status_code}") \ No newline at end of file diff --git a/scripts/databases/probeBase_parse.py b/scripts/databases/probeBase_parse.py index 6667c99..fa91f97 100644 --- a/scripts/databases/probeBase_parse.py +++ b/scripts/databases/probeBase_parse.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + import pandas as pd from tqdm import tqdm import re diff --git a/scripts/databases/probeBase_wide.py b/scripts/databases/probeBase_wide.py index 196eedd..9c5f6f4 100644 --- a/scripts/databases/probeBase_wide.py +++ b/scripts/databases/probeBase_wide.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + import pandas as pd # Read the CSV file diff --git a/scripts/generator/ML_filtration.py b/scripts/generator/ML_filtration.py index 1e978ab..d0a9e52 100644 --- a/scripts/generator/ML_filtration.py +++ b/scripts/generator/ML_filtration.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + import os import pandas as pd import numpy as np diff --git a/scripts/generator/probe_filt.py b/scripts/generator/probe_filt.py index c2963b7..9fcecd6 100644 --- a/scripts/generator/probe_filt.py +++ b/scripts/generator/probe_filt.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + # imports import sys import numpy as np diff --git a/scripts/grid_search/test_parameters.py b/scripts/grid_search/test_parameters.py index 7259b83..421b64f 100644 --- a/scripts/grid_search/test_parameters.py +++ b/scripts/grid_search/test_parameters.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + ''' Script for running the main pipeline with a given parameters grid to collect hit statistics. 
''' diff --git a/scripts/loop_generation/download_genomes.py b/scripts/loop_generation/download_genomes.py index e79f104..10795d2 100644 --- a/scripts/loop_generation/download_genomes.py +++ b/scripts/loop_generation/download_genomes.py @@ -1,4 +1,30 @@ #!/usr/bin/env python3 +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """ Download genomes for bacterial species from NCBI. Downloads up to 100 genomes per species. 
diff --git a/scripts/loop_generation/merge_outputs.py b/scripts/loop_generation/merge_outputs.py index 79fcc97..fbdf8dc 100644 --- a/scripts/loop_generation/merge_outputs.py +++ b/scripts/loop_generation/merge_outputs.py @@ -1,4 +1,30 @@ #!/usr/bin/env python3 +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """ Merge modeling outputs from all species into a single file with species_name column. 
""" diff --git a/scripts/validation/pdf_to_seq.py b/scripts/validation/pdf_to_seq.py index f30ce12..28408cb 100644 --- a/scripts/validation/pdf_to_seq.py +++ b/scripts/validation/pdf_to_seq.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + from pdfminer.high_level import extract_text import re import argparse diff --git a/scripts/validation/validation.py b/scripts/validation/validation.py index db93242..b7af24d 100644 --- a/scripts/validation/validation.py +++ b/scripts/validation/validation.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + from jsonschema import validate, ValidationError import json import argparse diff --git a/setup.py b/setup.py index ff0b081..bcd5bc3 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + from setuptools import setup, find_packages import os diff --git a/src/PROBESt/AI.py b/src/PROBESt/AI.py index 5cf8278..6fe99fb 100644 --- a/src/PROBESt/AI.py +++ b/src/PROBESt/AI.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import numpy as np import pandas as pd from sklearn.linear_model import LogisticRegression diff --git a/src/PROBESt/__init__.py b/src/PROBESt/__init__.py index 3b952fd..b283e9c 100644 --- a/src/PROBESt/__init__.py +++ b/src/PROBESt/__init__.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """PROBESt package.""" from . import genome_operations diff --git a/src/PROBESt/args.py b/src/PROBESt/args.py index e696f9b..a1deb6b 100644 --- a/src/PROBESt/args.py +++ b/src/PROBESt/args.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import argparse def arguments_parse(): diff --git a/src/PROBESt/bash_wrappers.py b/src/PROBESt/bash_wrappers.py index 1b46d0d..f03e2e2 100644 --- a/src/PROBESt/bash_wrappers.py +++ b/src/PROBESt/bash_wrappers.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import subprocess def uniline_fasta(args, out): diff --git a/src/PROBESt/check_probe_pdf.py b/src/PROBESt/check_probe_pdf.py index 5c6813e..8a8098c 100644 --- a/src/PROBESt/check_probe_pdf.py +++ b/src/PROBESt/check_probe_pdf.py @@ -1,4 +1,30 @@ #!/usr/bin/env python3 +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """ PDF Nucleotide Sequence Checker diff --git a/src/PROBESt/dedegeneration.py b/src/PROBESt/dedegeneration.py index a0c001b..11cd3d3 100644 --- a/src/PROBESt/dedegeneration.py +++ b/src/PROBESt/dedegeneration.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """ De-degeneration module for PROBESt. diff --git a/src/PROBESt/evolution.py b/src/PROBESt/evolution.py index 9d9b8ce..1c1ccf8 100644 --- a/src/PROBESt/evolution.py +++ b/src/PROBESt/evolution.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import random diff --git a/src/PROBESt/filtration.py b/src/PROBESt/filtration.py index 6a749fc..30f5652 100644 --- a/src/PROBESt/filtration.py +++ b/src/PROBESt/filtration.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import os import pandas as pd import numpy as np diff --git a/src/PROBESt/genome_operations.py b/src/PROBESt/genome_operations.py index 1e2c8d6..2b3999f 100644 --- a/src/PROBESt/genome_operations.py +++ b/src/PROBESt/genome_operations.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Module for genome operations including fetching, BLAST search and parsing.""" from Bio import Entrez, SeqIO diff --git a/src/PROBESt/merge.py b/src/PROBESt/merge.py index 600fcc4..16f84bb 100644 --- a/src/PROBESt/merge.py +++ b/src/PROBESt/merge.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import subprocess from shutil import copyfile diff --git a/src/PROBESt/misc.py b/src/PROBESt/misc.py index 0966726..0ae014d 100644 --- a/src/PROBESt/misc.py +++ b/src/PROBESt/misc.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import os import re import subprocess diff --git a/src/PROBESt/modeling.py b/src/PROBESt/modeling.py index 0c22b13..7b8400f 100644 --- a/src/PROBESt/modeling.py +++ b/src/PROBESt/modeling.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """ Modeling module for PROBESt pipeline. diff --git a/src/PROBESt/models_registry.py b/src/PROBESt/models_registry.py index 4c74497..68dd262 100644 --- a/src/PROBESt/models_registry.py +++ b/src/PROBESt/models_registry.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import torch import torch.nn as nn from torch.nn import functional as F diff --git a/src/PROBESt/oligominer.py b/src/PROBESt/oligominer.py index b1ddc3b..f492c3f 100644 --- a/src/PROBESt/oligominer.py +++ b/src/PROBESt/oligominer.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + from Bio import SeqIO import os import subprocess diff --git a/src/PROBESt/prepare_blast.py b/src/PROBESt/prepare_blast.py index d0e2a56..281d4f2 100644 --- a/src/PROBESt/prepare_blast.py +++ b/src/PROBESt/prepare_blast.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """ Module for preparing BLAST databases from FASTA directories. diff --git a/src/PROBESt/primer3.py b/src/PROBESt/primer3.py index 263a314..bde344b 100644 --- a/src/PROBESt/primer3.py +++ b/src/PROBESt/primer3.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + from Bio import SeqIO import os import subprocess diff --git a/src/PROBESt/probe_alignment_profiler.py b/src/PROBESt/probe_alignment_profiler.py index d226ace..5495465 100644 --- a/src/PROBESt/probe_alignment_profiler.py +++ b/src/PROBESt/probe_alignment_profiler.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Module for analyzing BLAST alignments of probe-target sequence pairs. This module provides functionality to process BLAST alignment results and calculate diff --git a/src/PROBESt/rna_structure.py b/src/PROBESt/rna_structure.py index 53ebbc5..004cd19 100644 --- a/src/PROBESt/rna_structure.py +++ b/src/PROBESt/rna_structure.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import RNA from typing import Union, Tuple, Optional diff --git a/src/PROBESt/tokenization.py b/src/PROBESt/tokenization.py index 242c06f..97b1ae2 100644 --- a/src/PROBESt/tokenization.py +++ b/src/PROBESt/tokenization.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Tokenization module for DNA sequences. This module provides functions to tokenize DNA sequences into k-mers, diff --git a/src/conf.py b/src/conf.py index f3b490b..31db77f 100644 --- a/src/conf.py +++ b/src/conf.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + # Configuration file for the Sphinx documentation builder. # # For the full list of built-in configuration values, see the documentation: diff --git a/tests/PROBESt/test_AI.py b/tests/PROBESt/test_AI.py index fa9246d..9b76932 100644 --- a/tests/PROBESt/test_AI.py +++ b/tests/PROBESt/test_AI.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pytest import pandas as pd import numpy as np diff --git a/tests/PROBESt/test_dedegeneration.py b/tests/PROBESt/test_dedegeneration.py index d64f5a2..76abfd4 100644 --- a/tests/PROBESt/test_dedegeneration.py +++ b/tests/PROBESt/test_dedegeneration.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Tests for the de-degeneration module.""" import pytest diff --git a/tests/PROBESt/test_filtration.py b/tests/PROBESt/test_filtration.py index c5c8207..bbd2f1d 100644 --- a/tests/PROBESt/test_filtration.py +++ b/tests/PROBESt/test_filtration.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pytest import pandas as pd import numpy as np diff --git a/tests/PROBESt/test_genome_operations.py b/tests/PROBESt/test_genome_operations.py index fbeaa92..28f6c7d 100644 --- a/tests/PROBESt/test_genome_operations.py +++ b/tests/PROBESt/test_genome_operations.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Tests for genome operations module.""" import pytest diff --git a/tests/PROBESt/test_merge.py b/tests/PROBESt/test_merge.py index 40b5329..462b308 100644 --- a/tests/PROBESt/test_merge.py +++ b/tests/PROBESt/test_merge.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pytest import os import subprocess diff --git a/tests/PROBESt/test_misc.py b/tests/PROBESt/test_misc.py index f256a89..58e9f02 100644 --- a/tests/PROBESt/test_misc.py +++ b/tests/PROBESt/test_misc.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pytest import pandas as pd import numpy as np diff --git a/tests/PROBESt/test_modeling.py b/tests/PROBESt/test_modeling.py index 1f36696..073919a 100644 --- a/tests/PROBESt/test_modeling.py +++ b/tests/PROBESt/test_modeling.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Tests for modeling module.""" import pytest diff --git a/tests/PROBESt/test_oligominer.py b/tests/PROBESt/test_oligominer.py index 62056b5..1483e18 100644 --- a/tests/PROBESt/test_oligominer.py +++ b/tests/PROBESt/test_oligominer.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pytest import os import sys diff --git a/tests/PROBESt/test_prepare_blast.py b/tests/PROBESt/test_prepare_blast.py index 8f62e43..be9be01 100644 --- a/tests/PROBESt/test_prepare_blast.py +++ b/tests/PROBESt/test_prepare_blast.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Tests for prepare_blast module.""" import pytest diff --git a/tests/PROBESt/test_primer3.py b/tests/PROBESt/test_primer3.py index 4ddca8a..ada2de6 100644 --- a/tests/PROBESt/test_primer3.py +++ b/tests/PROBESt/test_primer3.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pytest import os import sys diff --git a/tests/PROBESt/test_probe_alignment_profiler.py b/tests/PROBESt/test_probe_alignment_profiler.py index c322df9..8616ccd 100644 --- a/tests/PROBESt/test_probe_alignment_profiler.py +++ b/tests/PROBESt/test_probe_alignment_profiler.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Tests for probe alignment profiler.""" import pytest diff --git a/tests/PROBESt/test_rna_structure.py b/tests/PROBESt/test_rna_structure.py index 9abbb47..327ff6f 100644 --- a/tests/PROBESt/test_rna_structure.py +++ b/tests/PROBESt/test_rna_structure.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pytest from src.PROBESt.rna_structure import calculate_hairpin_prob, calculate_dimer_G, get_reverse_complement diff --git a/tests/__init__.py b/tests/__init__.py index df825a5..06bd778 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1 +1,27 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Test package for PROBESt.""" \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index ecd143a..122c259 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + """Common pytest configurations and fixtures.""" import pytest diff --git a/tests/database/test_probebase.py b/tests/database/test_probebase.py index 193f189..ddc513d 100644 --- a/tests/database/test_probebase.py +++ b/tests/database/test_probebase.py @@ -1,64 +1,90 @@ -"""Tests for ProbeBase database parsing.""" - -import pytest -import pandas as pd -from unittest.mock import patch, MagicMock -from scripts.databases.probeBase import parse_probebase_page - -@pytest.fixture -def mock_response(): - """Create a mock response object.""" - mock = MagicMock() - mock.status_code = 200 - return mock - -@patch('requests.get') -def test_response_problem(mock_get, mock_response): - """Test page with response problem.""" - # Setup mock response - mock_response.content = b"Error page" - mock_get.return_value = mock_response - - data = "https://probebase.csb.univie.ac.at/pb_report/probe" - resp = parse_probebase_page(data) - assert isinstance(resp, Warning) - -@patch('requests.get') -def test_response_empty(mock_get, mock_response): - """Test page with empty table.""" - # Setup mock response with empty table - mock_response.content = b""" - - - - - -
Header
No data
- - - """ - mock_get.return_value = mock_response - - data = "https://probebase.csb.univie.ac.at/pb_report/probe/1" - resp = parse_probebase_page(data) - assert isinstance(resp, Warning) - -@patch('requests.get') -def test_response_content(mock_get, mock_response): - """Test page with content.""" - # Setup mock response with valid table data - mock_response.content = b""" - - - - - -
Probe IDSequence
1ATGC
- - - """ - mock_get.return_value = mock_response - - data = "https://probebase.csb.univie.ac.at/pb_report/probe/2" - resp = parse_probebase_page(data) +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + +"""Tests for ProbeBase database parsing.""" + +import pytest +import pandas as pd +from unittest.mock import patch, MagicMock +from scripts.databases.probeBase import parse_probebase_page + +@pytest.fixture +def mock_response(): + """Create a mock response object.""" + mock = MagicMock() + mock.status_code = 200 + return mock + +@patch('requests.get') +def test_response_problem(mock_get, mock_response): + """Test page with response problem.""" + # Setup mock response + mock_response.content = b"Error page" + mock_get.return_value = mock_response + + data = "https://probebase.csb.univie.ac.at/pb_report/probe" + resp = parse_probebase_page(data) + assert isinstance(resp, Warning) + +@patch('requests.get') +def test_response_empty(mock_get, mock_response): + """Test page with empty table.""" + # Setup mock response with empty table + mock_response.content = b""" + + + + + +
Header
No data
+ + + """ + mock_get.return_value = mock_response + + data = "https://probebase.csb.univie.ac.at/pb_report/probe/1" + resp = parse_probebase_page(data) + assert isinstance(resp, Warning) + +@patch('requests.get') +def test_response_content(mock_get, mock_response): + """Test page with content.""" + # Setup mock response with valid table data + mock_response.content = b""" + + + + + +
Probe IDSequence
1ATGC
+ + + """ + mock_get.return_value = mock_response + + data = "https://probebase.csb.univie.ac.at/pb_report/probe/2" + resp = parse_probebase_page(data) assert isinstance(resp, pd.Series) \ No newline at end of file diff --git a/tests/scripts/fasta2table.py b/tests/scripts/fasta2table.py index 241f5b2..aa30fe7 100644 --- a/tests/scripts/fasta2table.py +++ b/tests/scripts/fasta2table.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + import unittest import os import subprocess diff --git a/tests/scripts/prep_db.py b/tests/scripts/prep_db.py index 728c00f..90b70a1 100644 --- a/tests/scripts/prep_db.py +++ b/tests/scripts/prep_db.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import unittest import os import subprocess diff --git a/tests/scripts/probe_check.py b/tests/scripts/probe_check.py index f78621a..085d0fe 100644 --- a/tests/scripts/probe_check.py +++ b/tests/scripts/probe_check.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import unittest import os import subprocess From 02b61bacd6746693e49603c2061d18d8c1e19b3f Mon Sep 17 00:00:00 2001 From: Alexander Serdyukov Date: Fri, 12 Dec 2025 17:26:18 +0400 Subject: [PATCH 102/102] Added README for the extraction module --- extraction/README.MD | 114 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 extraction/README.MD diff --git a/extraction/README.MD b/extraction/README.MD new file mode 100644 index 0000000..f1749fa --- /dev/null +++ b/extraction/README.MD @@ -0,0 +1,114 @@ +# Extraction submodule + +This folder contains a **file-driven, multi-pass extraction pipeline** built on **Outlines** (JSON-guided generation) and **Ollama** (local model serving). 
The pipeline reads configuration, prompts, and JSON Schemas from disk, runs a configured sequence of passes, writes versioned artifacts to an output directory (never overwriting), and can optionally persist results + timing metadata into an SQLite database. + +## What it does + +- **Pass-based extraction**: runs a configurable sequence of passes (e.g., `A_core`, `B_index`, `C_sequences`, ...) using Outlines JSON schema guidance. +- **Artifacts on disk**: writes raw text, pretty JSON, and error logs for each pass without overwriting prior runs. +- **Final stitching + validation**: stitches per-pass outputs into a final “FULL” object and can validate it against a “full schema” if configured. +- **SQLite optional**: can insert stitched results into SQLite via `hyb_db.insert_article_object(...)`. +- **Perf sidecars + continuation**: each JSON artifact can have a `*.perf.json` sidecar; the same metrics can also be mirrored into SQLite (`pipeline_artifacts`) and used for “resume” mode. + +## Repository layout + +The pipeline expects a “project directory” that contains: + +- `config/pipeline.json` (main configuration) +- `passes//schema.json` and `passes//prompt.txt` (per-pass assets) +- `passes/common.txt` (shared prompt prefix, optional) +- `schema/json/article.json` (full schema for final validation, optional) +- input directory with source files (configured in `pipeline.json`) + +The config shown in `config/pipeline.json` includes keys such as: +- `model_names`, `ollama_base_url`, `ollama_parameters`, `timeout_s` +- `input_dir`, `out_dir`, `article_glob` +- `pre_passes`, `construct_single_experiment_passes`, `passes` + +## Installation + +Python dependencies (minimum set used by the pipeline): + +```bash +pip install -r requirements.txt +``` + +Or use the conda/mamba to initialize environment from `environment.yml`. + +You also need: +- **Ollama** running locally (or reachable over HTTP), matching `ollama_base_url` in `config/pipeline.json`. 
+ +Optional: +- If `db_path` is set in config, SQLite will be used and schema will be auto-created. + +### Environment variables + +- `OPEN_BUTTON_TOKEN` (optional): if set, it is passed as a Bearer token in Ollama client headers. + +## How to run + +### 1) Configure `config/pipeline.json` + +Edit paths to match your machine. In the attached example, `input_dir` is set to an absolute path and `article_glob` uses a recursive pattern. + +Key fields you typically tune: +- `model_names`: list of Ollama model identifiers to run. +- `ollama_parameters`: e.g. `num_ctx`, `num_predict`, `temperature`, `seed`. +- `timeout_s`, `ollama_base_url` +- `out_dir`, `db_path` + +### 2) Run the pipeline + +#### CLI + +From the repository root (or anywhere, as long as you pass the correct project directory): + +```bash +python extraction/pipeline_filedriven.py extraction --fresh +``` + +- `project_dir` is the folder containing `config/`, `passes/`, etc. +- omit `--fresh` to enable continuation/resume behavior. + +#### Python + +```python +from extraction.pipeline_filedriven import run_project +run_project("extraction", fresh=False) +``` + +## Outputs + +Artifacts are written under `out_dir` (from `pipeline.json`). + +The pipeline writes, per pass and per model/article: +- raw text: `*.txt` +- JSON outputs: `*.json` +- log JSON: `*.log.json` +- error logs: `logs/*.log` +- perf sidecars: `*.perf.json` (one per emitted JSON artifact) + +Perf sidecars include timestamps, wallclock duration, and (when Ollama reports it) token counts. + +## Continuation / resume mode + +When `db_path` is configured, the pipeline can skip already completed work: + +- default `fresh=False`: for each `(model_name, article_name)`, if a successful `pass_name="FULL"` is recorded in the DB, the article can be skipped. +- `--fresh`: disables skipping and forces re-processing. + +Implementation note: +- completion is tracked in `pipeline_artifacts` and queried via `hyb_db.get_completed_passes(...)`. 
+ +## Database schema (optional) + +If `db_path` is set, `hyb_db` auto-creates tables and views and inserts: +- stitched article objects (`insert_article_object`) +- artifact-level perf bookkeeping (`pipeline_artifacts`) + +## Overall design (short) + +- **Config-first**: a project is a directory of config + prompts + schemas, making experiments easy to reproduce and version-control. +- **Multi-pass extraction**: each pass targets a specific sub-problem and produces a structured JSON artifact. +- **Immutable artifacts**: outputs are timestamped and never overwritten, enabling auditing and comparisons across runs. +- **Optional persistence**: results and metrics can be stored in SQLite for analysis and “resume” behavior.