diff --git a/app/app.py b/app/app.py index 635ed2d..2942304 100644 --- a/app/app.py +++ b/app/app.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import os import sys import re diff --git a/app/database.py b/app/database.py index a3bdb92..5bb694d 100644 --- a/app/database.py +++ b/app/database.py @@ -1,4 +1,30 @@ #!/usr/bin/env python3 +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """ PROBESt Database Search Application diff --git a/benchmark/make_primer3_template.py b/benchmark/make_primer3_template.py index 8aac9ac..247c792 100644 --- a/benchmark/make_primer3_template.py +++ b/benchmark/make_primer3_template.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + from Bio import SeqIO import os import sys diff --git a/benchmark/parse_blast_stats.py b/benchmark/parse_blast_stats.py index 0cd7922..25cad7f 100644 --- a/benchmark/parse_blast_stats.py +++ b/benchmark/parse_blast_stats.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + from Bio import SeqIO import pandas as pd import sys diff --git a/extraction/README.MD b/extraction/README.MD new file mode 100644 index 0000000..f1749fa --- /dev/null +++ b/extraction/README.MD @@ -0,0 +1,114 @@ +# Extraction submodule + +This folder contains a **file-driven, multi-pass extraction pipeline** built on **Outlines** (JSON-guided generation) and **Ollama** (local model serving). The pipeline reads configuration, prompts, and JSON Schemas from disk, runs a configured sequence of passes, writes versioned artifacts to an output directory (never overwriting), and can optionally persist results + timing metadata into an SQLite database. + +## What it does + +- **Pass-based extraction**: runs a configurable sequence of passes (e.g., `A_core`, `B_index`, `C_sequences`, ...) using Outlines JSON schema guidance. 
+- **Artifacts on disk**: writes raw text, pretty JSON, and error logs for each pass without overwriting prior runs. +- **Final stitching + validation**: stitches per-pass outputs into a final “FULL” object and can validate it against a “full schema” if configured. +- **SQLite optional**: can insert stitched results into SQLite via `hyb_db.insert_article_object(...)`. +- **Perf sidecars + continuation**: each JSON artifact can have a `*.perf.json` sidecar; the same metrics can also be mirrored into SQLite (`pipeline_artifacts`) and used for “resume” mode. + +## Repository layout + +The pipeline expects a “project directory” that contains: + +- `config/pipeline.json` (main configuration) +- `passes/<pass_name>/schema.json` and `passes/<pass_name>/prompt.txt` (per-pass assets) +- `passes/common.txt` (shared prompt prefix, optional) +- `schema/json/article.json` (full schema for final validation, optional) +- input directory with source files (configured in `pipeline.json`) + +The config shown in `config/pipeline.json` includes keys such as: +- `model_names`, `ollama_base_url`, `ollama_parameters`, `timeout_s` +- `input_dir`, `out_dir`, `article_glob` +- `pre_passes`, `construct_single_experiment_passes`, `passes` + +## Installation + +Python dependencies (minimum set used by the pipeline): + +```bash +pip install -r requirements.txt +``` + +Or use conda/mamba to initialize the environment from `environment.yml`. + +You also need: +- **Ollama** running locally (or reachable over HTTP), matching `ollama_base_url` in `config/pipeline.json`. + +Optional: +- If `db_path` is set in config, SQLite will be used and schema will be auto-created. + +### Environment variables + +- `OPEN_BUTTON_TOKEN` (optional): if set, it is passed as a Bearer token in Ollama client headers. + +## How to run + +### 1) Configure `config/pipeline.json` + +Edit paths to match your machine. In the provided example, `input_dir` is set to an absolute path and `article_glob` uses a recursive pattern. 
+ +Key fields you typically tune: +- `model_names`: list of Ollama model identifiers to run. +- `ollama_parameters`: e.g. `num_ctx`, `num_predict`, `temperature`, `seed`. +- `timeout_s`, `ollama_base_url` +- `out_dir`, `db_path` + +### 2) Run the pipeline + +#### CLI + +From the repository root (or anywhere, as long as you pass the correct project directory): + +```bash +python extraction/pipeline_filedriven.py extraction --fresh +``` + +- `project_dir` is the folder containing `config/`, `passes/`, etc. +- omit `--fresh` to enable continuation/resume behavior. + +#### Python + +```python +from extraction.pipeline_filedriven import run_project +run_project("extraction", fresh=False) +``` + +## Outputs + +Artifacts are written under `out_dir` (from `pipeline.json`). + +The pipeline writes, per pass and per model/article: +- raw text: `*.txt` +- JSON outputs: `*.json` +- log JSON: `*.log.json` +- error logs: `logs/*.log` +- perf sidecars: `*.perf.json` (one per emitted JSON artifact) + +Perf sidecars include timestamps, wallclock duration, and (when Ollama reports it) token counts. + +## Continuation / resume mode + +When `db_path` is configured, the pipeline can skip already completed work: + +- default `fresh=False`: for each `(model_name, article_name)`, if a successful `pass_name="FULL"` is recorded in the DB, the article can be skipped. +- `--fresh`: disables skipping and forces re-processing. + +Implementation note: +- completion is tracked in `pipeline_artifacts` and queried via `hyb_db.get_completed_passes(...)`. + +## Database schema (optional) + +If `db_path` is set, `hyb_db` auto-creates tables and views and inserts: +- stitched article objects (`insert_article_object`) +- artifact-level perf bookkeeping (`pipeline_artifacts`) + +## Overall design (short) + +- **Config-first**: a project is a directory of config + prompts + schemas, making experiments easy to reproduce and version-control. 
+- **Multi-pass extraction**: each pass targets a specific sub-problem and produces a structured JSON artifact. +- **Immutable artifacts**: outputs are timestamped and never overwritten, enabling auditing and comparisons across runs. +- **Optional persistence**: results and metrics can be stored in SQLite for analysis and “resume” behavior. diff --git a/extraction/config/pipeline.json b/extraction/config/pipeline.json new file mode 100644 index 0000000..06a36b5 --- /dev/null +++ b/extraction/config/pipeline.json @@ -0,0 +1,203 @@ +{ + "model_names": [ + "myaniu/qwen2.5-1m:7b", + "deepseek-r1:1.5b", + "qwen2.5-coder:3b", + "phi4-mini-reasoning:latest", + "gemma3:4b", + "phi3:latest", + "llama3.1:latest", + "myaniu/qwen2.5-1m:14b", + "phi4:14b", + "gemma3:27b" + ], + "ollama_parameters": { + "num_ctx": 65536, + "num_predict": 65536, + "temperature": 0.1, + "seed": 42 + }, + "ollama_base_url": "http://127.0.0.1:11434", + "timeout_s": 60, + "__input_dir": "/mnt/Models/articles2_marker/no_llm/small_md/", + "___input_dir": "/mnt/Models/articles2_marker/no_llm/markdown/", + "input_dir": "/mnt/Models/articles2_marker/no_llm/bench_md/", + "_input_dir": "input/md", + "out_dir": "outlines_output_db", + "full_schema_path": "schema/json/article.json", + "common_prompt_path": "passes/common.txt", + "db_path": "outlines_output_db/massive.sqlite", + "article_glob": "**/*.md", + "_pre_passes": [ + { + "name": "SeqPrompt", + "schema": "passes/_1_SeqPrompt/schema.json", + "prompt": "passes/_1_SeqPrompt/prompt.txt", + "timeout": 60 + }, + { + "name": "SeqPrompt_strict", + "schema": "passes/_1_SeqPrompt/schema_strict.json", + "prompt": "passes/_1_SeqPrompt/prompt_strict.txt", + "timeout": 60 + } + ], + "pre_passes": [ + { + "name": "SeqPrompt_strict", + "schema": "passes/_1_SeqPrompt/schema_strict.json", + "prompt": "passes/_1_SeqPrompt/prompt_strict.txt", + "timeout": 60 + }, + { + "name": "SeqPrompt", + "schema": "passes/_1_SeqPrompt/schema.json", + "prompt": 
"passes/_1_SeqPrompt/prompt.txt", + "timeout": 60 + } + ], + "construct_single_experiment_passes": [ + { + "name": "_4_ConstructSingleSmallExperiment", + "schema": "passes/_4_ConstructSingleSmallExperiment/schema.json", + "prompt": "passes/_4_ConstructSingleSmallExperiment/prompt.txt", + "timeout": 60 + }, + { + "name": "_6_ConstructSingleSequenceExperimentAndOutcome", + "schema": "passes/_6_ConstructSingleSequenceExperimentAndOutcome/schema.json", + "prompt": "passes/_6_ConstructSingleSequenceExperimentAndOutcome/prompt.txt", + "timeout": 60 + } + ], + "passes": [ + { + "name": "A_core", + "schema": "passes/A_core/schema.json", + "prompt": "passes/A_core/prompt.txt", + "timeout": 60 + }, + { + "name": "B_index", + "schema": "passes/B_index/schema.json", + "prompt": "passes/B_index/prompt.txt", + "timeout": 600 + }, + { + "name": "B1_index_types", + "schema": "passes/B1_index_types/schema.json", + "prompt": "passes/B1_index_types/prompt.txt", + "timeout": 600 + }, + { + "name": "B2_index_desc", + "schema": "passes/B2_index_desc/schema.json", + "prompt": "passes/B2_index_desc/prompt.txt", + "timeout": 600 + }, + { + "name": "C5_probes_opt_target", + "schema": "passes/C5_probes_opt_target/schema.json", + "prompt": "passes/C5_probes_opt_target/prompt.txt", + "timeout": 900 + }, + { + "name": "C_sequences", + "schema": "passes/C_sequences/schema.json", + "prompt": "passes/C_sequences/prompt.txt", + "timeout": 900 + }, + { + "name": "C1_probe_core", + "schema": "passes/C1_probe_core/schema.json", + "prompt": "passes/C1_probe_core/prompt.txt" + }, + { + "name": "C2_target_primers", + "schema": "passes/C2_target_primers/schema.json", + "prompt": "passes/C2_target_primers/prompt.txt" + }, + { + "name": "C3_related", + "schema": "passes/C3_related/schema.json", + "prompt": "passes/C3_related/prompt.txt" + }, + { + "name": "D_parameters", + "schema": "passes/D_parameters/schema.json", + "prompt": "passes/D_parameters/prompt.txt" + }, + { + "name": "E_outcomes", + "schema": 
"passes/E_outcomes/schema.json", + "prompt": "passes/E_outcomes/prompt.txt" + }, + { + "name": "F_pairings", + "schema": "passes/F_pairings/schema.json", + "prompt": "passes/F_pairings/prompt.txt" + } + ], + "ignored_passes": [ + { + "name": "B_index", + "schema": "passes/B_index/schema.json", + "prompt": "passes/B_index/prompt.txt", + "timeout": 600 + }, + { + "name": "B2_index_desc", + "schema": "passes/B2_index_desc/schema.json", + "prompt": "passes/B2_index_desc/prompt.txt", + "timeout": 600 + }, + { + "name": "C5_probes_opt_target", + "schema": "passes/C5_probes_opt_target/schema.json", + "prompt": "passes/C5_probes_opt_target/prompt.txt", + "timeout": 900 + }, + { + "name": "C_sequences", + "schema": "passes/C_sequences/schema.json", + "prompt": "passes/C_sequences/prompt.txt", + "timeout": 900 + }, + { + "name": "C1_probe_core", + "schema": "passes/C1_probe_core/schema.json", + "prompt": "passes/C1_probe_core/prompt.txt" + }, + { + "name": "C2_target_primers", + "schema": "passes/C2_target_primers/schema.json", + "prompt": "passes/C2_target_primers/prompt.txt" + }, + { + "name": "C3_related", + "schema": "passes/C3_related/schema.json", + "prompt": "passes/C3_related/prompt.txt" + }, + { + "name": "D_parameters", + "schema": "passes/D_parameters/schema.json", + "prompt": "passes/D_parameters/prompt.txt" + }, + { + "name": "E_outcomes", + "schema": "passes/E_outcomes/schema.json", + "prompt": "passes/E_outcomes/prompt.txt" + }, + { + "name": "F_pairings", + "schema": "passes/F_pairings/schema.json", + "prompt": "passes/F_pairings/prompt.txt" + }, + { + "name": "full_schema", + "schema": "schemas/article.json", + "prompt": "passes/common.txt", + "timeout": 900 + } + ] +} \ No newline at end of file diff --git a/extraction/environment.yml b/extraction/environment.yml index 029859f..9f03939 100644 --- a/extraction/environment.yml +++ b/extraction/environment.yml @@ -1,20 +1,288 @@ -# environment.yml name: extraction channels: - conda-forge dependencies: - - 
python=3.11 - - pip + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - argon2-cffi=25.1.0=pyhd8ed1ab_0 + - argon2-cffi-bindings=25.1.0=py311h49ec1c0_0 + - arrow=1.3.0=pyhd8ed1ab_1 + - asttokens=3.0.0=pyhd8ed1ab_1 + - async-lru=2.0.5=pyh29332c3_0 + - attrs=25.3.0=pyh71513ae_0 + - babel=2.17.0=pyhd8ed1ab_0 + - beautifulsoup4=4.14.2=pyha770c72_0 + - bleach=6.2.0=pyh29332c3_4 + - bleach-with-css=6.2.0=h82add2a_4 + - brotli-python=1.1.0=py311h1ddb823_4 + - bzip2=1.0.8=h4bc722e_7 + - ca-certificates=2025.8.3=hbd8a1cb_0 + - cached-property=1.5.2=hd8ed1ab_1 + - cached_property=1.5.2=pyha770c72_1 + - comm=0.2.3=pyhe01879c_0 + - debugpy=1.8.17=py311hc665b79_0 + - decorator=5.2.1=pyhd8ed1ab_0 + - defusedxml=0.7.1=pyhd8ed1ab_0 + - exceptiongroup=1.3.0=pyhd8ed1ab_0 + - executing=2.2.1=pyhd8ed1ab_0 + - fqdn=1.5.1=pyhd8ed1ab_1 + - h11=0.16.0=pyhd8ed1ab_0 + - h2=4.3.0=pyhcf101f3_0 + - hpack=4.1.0=pyhd8ed1ab_0 + - httpcore=1.0.9=pyh29332c3_0 + - httpx=0.28.1=pyhd8ed1ab_0 + - hyperframe=6.1.0=pyhd8ed1ab_0 + - idna=3.10=pyhd8ed1ab_1 + - importlib-metadata=8.7.0=pyhe01879c_1 + - ipykernel=6.30.1=pyh82676e8_0 + - ipython=9.5.0=pyhfa0c392_0 + - ipython_pygments_lexers=1.1.1=pyhd8ed1ab_0 + - ipywidgets=8.1.7=pyhd8ed1ab_0 + - isoduration=20.11.0=pyhd8ed1ab_1 + - jedi=0.19.2=pyhd8ed1ab_1 + - jinja2=3.1.6=pyhd8ed1ab_0 + - jsonpointer=3.0.0=py311h38be061_2 + - jsonschema=4.25.1=pyhe01879c_0 + - jsonschema-specifications=2025.9.1=pyhcf101f3_0 + - jsonschema-with-format-nongpl=4.25.1=he01879c_0 + - jupyter=1.1.1=pyhd8ed1ab_1 + - jupyter-lsp=2.3.0=pyhcf101f3_0 + - jupyter_client=8.6.3=pyhd8ed1ab_1 + - jupyter_console=6.6.3=pyhd8ed1ab_1 + - jupyter_core=5.8.1=pyh31011fe_0 + - jupyter_events=0.12.0=pyh29332c3_0 + - jupyter_server=2.17.0=pyhcf101f3_0 + - jupyter_server_terminals=0.5.3=pyhd8ed1ab_1 + - jupyterlab=4.4.9=pyhd8ed1ab_0 + - jupyterlab_pygments=0.3.0=pyhd8ed1ab_2 + - jupyterlab_server=2.27.3=pyhd8ed1ab_1 + - jupyterlab_widgets=3.0.15=pyhd8ed1ab_0 + - 
keyutils=1.6.3=hb9d3cd8_0 + - krb5=1.21.3=h659f571_0 + - lark=1.3.0=pyhd8ed1ab_0 + - ld_impl_linux-64=2.43=h712a8e2_4 + - libedit=3.1.20250104=pl5321h7949ede_0 + - libexpat=2.7.0=h5888daf_0 + - libffi=3.4.6=h2dba641_1 + - libgcc=15.1.0=h767d61c_2 + - libgcc-ng=15.1.0=h69a702a_2 + - libgomp=15.1.0=h767d61c_2 + - liblzma=5.8.1=hb9d3cd8_1 + - libnsl=2.0.1=hd590300_0 + - libsodium=1.0.20=h4ab18f5_0 + - libsqlite=3.49.2=hee588c1_0 + - libstdcxx=15.1.0=h8f9b012_2 + - libstdcxx-ng=15.1.0=h4852527_2 + - libuuid=2.38.1=h0b41bf4_0 + - libxcrypt=4.4.36=hd590300_1 + - libzlib=1.3.1=hb9d3cd8_2 + - matplotlib-inline=0.1.7=pyhd8ed1ab_1 + - mistune=3.1.4=pyhcf101f3_0 + - nbclient=0.10.2=pyhd8ed1ab_0 + - nbconvert-core=7.16.6=pyh29332c3_0 + - nbformat=5.10.4=pyhd8ed1ab_1 + - ncurses=6.5=h2d0b736_3 + - nest-asyncio=1.6.0=pyhd8ed1ab_1 + - notebook=7.4.6=pyhd8ed1ab_0 + - notebook-shim=0.2.4=pyhd8ed1ab_1 + - openssl=3.5.3=h26f9b46_1 + - overrides=7.7.0=pyhd8ed1ab_1 + - packaging=25.0=pyh29332c3_1 + - pandocfilters=1.5.0=pyhd8ed1ab_0 + - parso=0.8.5=pyhcf101f3_0 + - pexpect=4.9.0=pyhd8ed1ab_1 + - pickleshare=0.7.5=pyhd8ed1ab_1004 + - pip=25.1.1=pyh8b19718_0 + - platformdirs=4.4.0=pyhcf101f3_0 + - prometheus_client=0.23.1=pyhd8ed1ab_0 + - prompt-toolkit=3.0.52=pyha770c72_0 + - prompt_toolkit=3.0.52=hd8ed1ab_0 + - ptyprocess=0.7.0=pyhd8ed1ab_1 + - pure_eval=0.2.3=pyhd8ed1ab_1 + - pycparser=2.22=pyh29332c3_1 + - pysocks=1.7.1=pyha55dd90_7 + - python=3.11.12=h9e4cc4f_0_cpython + - python-fastjsonschema=2.21.2=pyhe01879c_0 + - python-json-logger=2.0.7=pyhd8ed1ab_0 + - python_abi=3.11=8_cp311 + - pytz=2025.2=pyhd8ed1ab_0 + - pyzmq=27.1.0=py311h2315fbb_0 + - readline=8.2=h8c095d6_2 + - referencing=0.36.2=pyh29332c3_0 + - rfc3339-validator=0.1.4=pyhd8ed1ab_1 + - rfc3986-validator=0.1.1=pyh9f0ad1d_0 + - rfc3987-syntax=1.1.0=pyhe01879c_1 + - rpds-py=0.27.1=py311h902ca64_1 + - send2trash=1.8.3=pyh0d859eb_1 + - setuptools=80.1.0=pyhff2d567_0 + - six=1.17.0=pyhe01879c_1 + - 
sniffio=1.3.1=pyhd8ed1ab_1 + - soupsieve=2.8=pyhd8ed1ab_0 + - stack_data=0.6.3=pyhd8ed1ab_1 + - terminado=0.18.1=pyh0d859eb_0 + - tinycss2=1.4.0=pyhd8ed1ab_0 + - tk=8.6.13=noxft_h4845f30_101 + - tomli=2.2.1=pyhe01879c_2 + - tornado=6.5.2=py311h49ec1c0_1 + - traitlets=5.14.3=pyhd8ed1ab_1 + - types-python-dateutil=2.9.0.20250822=pyhd8ed1ab_0 + - typing_extensions=4.15.0=pyhcf101f3_0 + - typing_utils=0.1.0=pyhd8ed1ab_1 + - uri-template=1.3.0=pyhd8ed1ab_1 + - wcwidth=0.2.14=pyhd8ed1ab_0 + - webcolors=24.11.1=pyhd8ed1ab_0 + - webencodings=0.5.1=pyhd8ed1ab_3 + - websocket-client=1.8.0=pyhd8ed1ab_1 + - wheel=0.45.1=pyhd8ed1ab_1 + - widgetsnbextension=4.0.14=pyhd8ed1ab_0 + - yaml=0.2.5=h280c20c_3 + - zeromq=4.3.5=h387f397_9 + - zipp=3.23.0=pyhd8ed1ab_0 + - zstandard=0.25.0=py311haee01d2_0 + - zstd=1.5.7=hb8e6e7a_2 - pip: - - torch # for local LLM - - transformers # huggingface - - tqdm # progress bars - - pdfplumber # PDF text layer - - pytesseract # OCR fallback - - pillow # image support - - dicttoxml # JSON→XML conversion - - lxml # XML + DTD validation - - rich # pretty logging - - accelerate - - json5 - + - accelerate==1.6.0 + - altair==5.5.0 + - annotated-types==0.7.0 + - anthropic==0.46.0 + - anyio==4.10.0 + - argcomplete==3.5.1 + - bitsandbytes==0.46.1 + - blinker==1.9.0 + - borb==2.1.25 + - cachetools==6.2.0 + - certifi==2025.4.26 + - cffi==1.17.1 + - cfgv==3.4.0 + - charset-normalizer==3.4.2 + - click==8.3.0 + - cloudpickle==3.1.1 + - cobble==0.1.4 + - cryptography==44.0.3 + - cssselect2==0.8.0 + - curlify==2.2.1 + - dicttoxml==1.7.16 + - diskcache==5.6.3 + - distlib==0.4.0 + - distro==1.9.0 + - dunamai==1.25.0 + - ebooklib==0.18 + - einops==0.8.1 + - et-xmlfile==2.0.0 + - filelock==3.18.0 + - filetype==1.2.0 + - fonttools==4.59.0 + - fsspec==2025.3.2 + - ftfy==6.3.1 + - genson==1.3.0 + - gitdb==4.0.12 + - gitpython==3.1.45 + - google-auth==2.41.1 + - google-genai==1.40.0 + - hf-xet==1.1.10 + - huggingface-hub==0.35.3 + - identify==2.6.15 + - jiter==0.11.0 + - 
joblib==1.5.2 + - json-repair==0.48.0 + - json5==0.12.0 + - jsonpath-ng==1.7.0 + - loguru==0.7.3 + - lxml==5.4.0 + - mammoth==1.11.0 + - markdown-it-py==3.0.0 + - markdown2==2.5.4 + - markdownify==1.2.0 + - marker-pdf==1.10.1 + - markupsafe==3.0.2 + - mdurl==0.1.2 + - mpmath==1.3.0 + - narwhals==2.6.0 + - networkx==3.4.2 + - nodeenv==1.9.1 + - numpy==2.2.5 + - nvidia-cublas-cu12==12.6.4.1 + - nvidia-cuda-cupti-cu12==12.6.80 + - nvidia-cuda-nvrtc-cu12==12.6.77 + - nvidia-cuda-runtime-cu12==12.6.77 + - nvidia-cudnn-cu12==9.5.1.17 + - nvidia-cufft-cu12==11.3.0.4 + - nvidia-cufile-cu12==1.11.1.6 + - nvidia-curand-cu12==10.3.7.77 + - nvidia-cusolver-cu12==11.7.1.2 + - nvidia-cusparse-cu12==12.5.4.2 + - nvidia-cusparselt-cu12==0.6.3 + - nvidia-nccl-cu12==2.26.2 + - nvidia-nvjitlink-cu12==12.6.85 + - nvidia-nvtx-cu12==12.6.77 + - ollama==0.6.0 + - openai==1.109.1 + - opencv-python-headless==4.11.0.86 + - openpyxl==3.1.5 + - outlines==1.2.5 + - outlines-core==0.2.11 + - pandas==2.3.3 + - pdfminer-six==20250327 + - pdfplumber==0.11.6 + - pdftext==0.6.3 + - pillow==10.4.0 + - ply==3.11 + - poetry-dynamic-versioning==1.9.1 + - pre-commit==4.3.0 + - protobuf==6.32.1 + - psutil==7.0.0 + - pyarrow==21.0.0 + - pyasn1==0.6.1 + - pyasn1-modules==0.4.2 + - pydantic==2.11.9 + - pydantic-core==2.33.2 + - pydantic-settings==2.11.0 + - pydeck==0.9.1 + - pydyf==0.11.0 + - pygments==2.19.1 + - pymupdf==1.26.3 + - pypdfium2==4.30.0 + - pyphen==0.17.2 + - pytesseract==0.3.13 + - python-barcode==0.15.1 + - python-dateutil==2.6.1 + - python-dotenv==1.1.1 + - python-pptx==1.0.2 + - pyyaml==6.0.2 + - qrcode==8.2 + - rapidfuzz==3.14.1 + - regex==2024.11.6 + - requests==2.32.3 + - rich==14.0.0 + - rsa==4.9.1 + - safetensors==0.5.3 + - scikit-learn==1.7.2 + - scipy==1.16.2 + - smmap==5.0.2 + - streamlit==1.50.0 + - streamlit-ace==0.1.1 + - surya-ocr==0.17.0 + - sympy==1.14.0 + - tenacity==9.1.2 + - threadpoolctl==3.6.0 + - tinyhtml5==2.0.0 + - tokenizers==0.22.1 + - toml==0.10.2 + - 
tomlkit==0.13.3 + - torch==2.7.0 + - tqdm==4.67.1 + - transformers==4.56.2 + - triton==3.3.0 + - typing-extensions==4.13.2 + - typing-inspection==0.4.1 + - tzdata==2025.2 + - urllib3==2.2.3 + - vastai==0.3.1 + - virtualenv==20.34.0 + - watchdog==6.0.0 + - weasyprint==63.1 + - websockets==15.0.1 + - xdg==6.0.0 + - xlsxwriter==3.2.9 + - zopfli==0.2.3.post1 +prefix: /home/tux/miniforge3/envs/extraction diff --git a/extraction/extract_articles.py b/extraction/extract_articles.py index 53eea8c..5158ef6 100644 --- a/extraction/extract_articles.py +++ b/extraction/extract_articles.py @@ -1,4 +1,30 @@ #!/usr/bin/env python3 +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Aleksandr Serdiukov, Vitalii Dravgelis, Daniil Smutin, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + import os import re import json5 as json diff --git a/extraction/hyb_db.py b/extraction/hyb_db.py new file mode 100644 index 0000000..2e5d0ac --- /dev/null +++ b/extraction/hyb_db.py @@ -0,0 +1,1600 @@ +# -*- coding: utf-8 -*- +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Aleksandr Serdiukov, Vitalii Dravgelis, Daniil Smutin, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + +import outlines +from outlines.types import JsonSchema +import ollama +import re +from typing import Optional, Tuple, Dict, Any, List +import json +from pathlib import Path +from tqdm import tqdm +#from __future__ import annotations +import sqlite3 +from contextlib import contextmanager, closing +from datetime import datetime, timezone +from loguru import logger +from ollama import chat, ChatResponse +from json_repair import repair_json +import os, sys +from jsonschema import Draft202012Validator + +# -*- coding: utf-8 -*- +""" +SQLite dataset builder for hybridization-article extractions. + +Public API: + init_db(db_path) + + # HYBRIDIZATION ARTICLE / EXPERIMENT STRUCTURE + insert_article_object(db_path, article_obj, model_name, article_name) + insert_seqdesc_object(...) + + # PERFORMANCE / PIPELINE METRICS + insert_perf_event(db_path, event_dict) + insert_perf_events(db_path, [event_dict, ...]) + + # NEW (ADDED FOR PIPELINE CONTINUATION + SIDE-CAR PERF TRACKING) + insert_pipeline_artifact(db_path, artifact_dict) + get_pipeline_artifacts_for_article(db_path, model_name, article_name) + get_completed_passes(db_path, model_name, article_name) + +Features: +- Auto-initializes schema (tables, indexes, views). +- Preserves every run (no overwrites). +- Normalizes sense/antisense & prime markers. +- Guards against non-oligo "probes" (skips probe insertion but keeps experiment). +- Includes Ollama-style helper tools with Google docstrings. +- perf_events table for timings/tokens of every step/question. +- NEW: pipeline_artifacts table for per-pass / per-file sidecar metrics and + continuation bookkeeping. Each JSON artifact the pipeline writes on disk + (per pass, per article, per model) can have a "sidecar" JSON with timing + and token usage. 
We mirror that data into pipeline_artifacts so that: + * downstream QC / benchmarking code can query timings and tokens + * the pipeline can resume/continue work by checking which passes for a + given (model_name, article_name) have already succeeded. + The pipeline will: + - emit a sidecar JSON next to every produced .json/.log.json/etc. file + containing timing, token counts, and file paths + - call insert_pipeline_artifact(...) with the same metadata + Because this module always calls _ensure_schema() on connect, the new table + will be created automatically in older existing DBs without migration steps. + +Tables overview +--------------- +articles / runs / raw_payloads / experiments / ... : + Structured hybridization experiment data (final stitched objects). + +seqdesc_* : + Per-sequence descriptors from sequence descriptor passes. + +perf_events : + Fine-grained timing and token usage for any granular step/question. + +pipeline_artifacts : + Coarse-grained artifact-level bookkeeping used for: + - sidecar perf metrics per produced JSON artifact + - continuation / resume logic (which pipeline passes already finished) +""" + +import json +import re +import sqlite3 +from contextlib import contextmanager +from datetime import datetime, timezone +from typing import Any, Dict, Optional, Tuple, List + + +@contextmanager +def _db(db_path: str): + """Context manager for SQLite connection with FK + WAL enabled.""" + conn = sqlite3.connect(db_path) + try: + conn.execute("PRAGMA foreign_keys = ON;") + conn.execute("PRAGMA journal_mode = WAL;") + yield conn + conn.commit() + finally: + conn.close() + + +# ----------------------------- Schema DDL ----------------------------- # + +_TABLES_AND_INDEXES_SQL = """ +CREATE TABLE IF NOT EXISTS articles ( + id INTEGER PRIMARY KEY, + doi TEXT NOT NULL UNIQUE, + latest_article_name TEXT, + latest_abstract TEXT, + latest_topic TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')) +); + +CREATE TABLE IF NOT EXISTS runs ( + id 
INTEGER PRIMARY KEY, + article_id INTEGER NOT NULL, + model_name TEXT NOT NULL, + article_name TEXT, + branch TEXT NOT NULL CHECK (branch IN ('experiments','no_sequences')), + created_at TEXT NOT NULL, + FOREIGN KEY (article_id) REFERENCES articles(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_runs_article ON runs(article_id); +CREATE INDEX IF NOT EXISTS idx_runs_created ON runs(created_at); +CREATE INDEX IF NOT EXISTS idx_runs_model ON runs(model_name); + +CREATE TABLE IF NOT EXISTS raw_payloads ( + run_id INTEGER PRIMARY KEY, + json TEXT NOT NULL, + FOREIGN KEY (run_id) REFERENCES runs(id) ON DELETE CASCADE +); + +CREATE TABLE IF NOT EXISTS experiments ( + id INTEGER PRIMARY KEY, + run_id INTEGER NOT NULL, + id_exp TEXT NOT NULL, + type TEXT, + description TEXT NOT NULL, + raw_description TEXT, + organism TEXT, + technology TEXT, + annealing_qualitative INTEGER, -- NULL/0/1 + rna_impurities_qualitative INTEGER, -- NULL/0/1 + UNIQUE (run_id, id_exp), + FOREIGN KEY (run_id) REFERENCES runs(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_experiments_run ON experiments(run_id); +CREATE INDEX IF NOT EXISTS idx_experiments_idexp ON experiments(id_exp); + +CREATE TABLE IF NOT EXISTS oligos ( + id INTEGER PRIMARY KEY, + raw TEXT NOT NULL, + sequence TEXT, + length_bases INTEGER, + prime_prefix INTEGER CHECK (prime_prefix IN (3,5)), + five_prime_label TEXT, + three_prime_label TEXT, + sense_antisense TEXT CHECK (sense_antisense IN ('sense','antisense')), + provenance_source_type TEXT, + provenance_page INTEGER, + provenance_section TEXT, + provenance_quote TEXT, + provenance_notes TEXT +); +CREATE INDEX IF NOT EXISTS idx_oligos_seq ON oligos(sequence); + +CREATE TABLE IF NOT EXISTS probes ( + id INTEGER PRIMARY KEY, + experiment_id INTEGER NOT NULL, + name TEXT NOT NULL, + amplicon_id TEXT, + oligo_id INTEGER NOT NULL, + fluorophore TEXT, + quencher TEXT, + sense_antisense TEXT CHECK (sense_antisense IN ('sense','antisense')), + notes TEXT, + FOREIGN 
KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE, + FOREIGN KEY (oligo_id) REFERENCES oligos(id) +); +CREATE INDEX IF NOT EXISTS idx_probes_name ON probes(name); +CREATE INDEX IF NOT EXISTS idx_probes_exp ON probes(experiment_id); + +CREATE TABLE IF NOT EXISTS target_sequences ( + id INTEGER PRIMARY KEY, + experiment_id INTEGER NOT NULL, + oligo_id INTEGER NOT NULL, + FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE, + FOREIGN KEY (oligo_id) REFERENCES oligos(id) +); + +CREATE TABLE IF NOT EXISTS primer_pairs ( + id INTEGER PRIMARY KEY, + experiment_id INTEGER NOT NULL, + forward_oligo_id INTEGER NOT NULL, + reverse_oligo_id INTEGER NOT NULL, + FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE, + FOREIGN KEY (forward_oligo_id) REFERENCES oligos(id), + FOREIGN KEY (reverse_oligo_id) REFERENCES oligos(id) +); +CREATE INDEX IF NOT EXISTS idx_primers_exp ON primer_pairs(experiment_id); + +CREATE TABLE IF NOT EXISTS related_sequences ( + id INTEGER PRIMARY KEY, + experiment_id INTEGER NOT NULL, + oligo_id INTEGER NOT NULL, + description TEXT, + FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE, + FOREIGN KEY (oligo_id) REFERENCES oligos(id) +); +CREATE INDEX IF NOT EXISTS idx_relseqs_exp ON related_sequences(experiment_id); + +CREATE TABLE IF NOT EXISTS outcomes ( + id INTEGER PRIMARY KEY, + experiment_id INTEGER NOT NULL, + outcome INTEGER, -- NULL/0/1 + comparative_notes TEXT, + FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_outcomes_exp ON outcomes(experiment_id); + +CREATE TABLE IF NOT EXISTS measurements ( + id INTEGER PRIMARY KEY, + experiment_id INTEGER NOT NULL, + key TEXT NOT NULL, + raw TEXT NOT NULL, + value REAL, + unit TEXT, + si_value REAL, + si_unit TEXT, + assumptions TEXT, + provenance_source_type TEXT, + provenance_page INTEGER, + provenance_section TEXT, + provenance_quote TEXT, + provenance_notes TEXT, + 
FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_measurements_exp_key ON measurements(experiment_id, key); + +CREATE TABLE IF NOT EXISTS pairings ( + id INTEGER PRIMARY KEY, + experiment_id INTEGER NOT NULL, + paired_with_probe_name TEXT, + relationship TEXT, + FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_pairings_exp ON pairings(experiment_id); + +CREATE TABLE IF NOT EXISTS extraction_report_entries ( + id INTEGER PRIMARY KEY, + run_id INTEGER NOT NULL, + experiment_id INTEGER, + kind TEXT NOT NULL CHECK (kind IN ('missing','uncertain')), + json_pointer TEXT NOT NULL, + notes TEXT, + FOREIGN KEY (run_id) REFERENCES runs(id) ON DELETE CASCADE, + FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_report_run_kind ON extraction_report_entries(run_id, kind); + +CREATE TABLE IF NOT EXISTS no_sequences_explanations ( + id INTEGER PRIMARY KEY, + run_id INTEGER NOT NULL, + explanation TEXT NOT NULL, + FOREIGN KEY (run_id) REFERENCES runs(id) ON DELETE CASCADE +); +CREATE INDEX IF NOT EXISTS idx_no_seq_run ON no_sequences_explanations(run_id); + +/* generic performance/timing/token metrics for all steps and questions */ +CREATE TABLE IF NOT EXISTS perf_events ( + id INTEGER PRIMARY KEY, + namespace TEXT NOT NULL CHECK (namespace IN ('pre_pass','pass','query','construct','stitch','db_insert','other')), + article_name TEXT, + model_name TEXT, + article_doi TEXT, + pass_name TEXT, + sequence_key TEXT, + question_param TEXT, + started_at TEXT, + finished_at TEXT, + duration_ms REAL, + prompt_tokens INTEGER, + completion_tokens INTEGER, + total_tokens INTEGER, + tokens_per_sec REAL, + sidecar_path TEXT, + notes TEXT +); +CREATE INDEX IF NOT EXISTS idx_perf_ns_article_model ON perf_events(namespace, article_name, model_name); + +/* NEW TABLE: + pipeline_artifacts captures artifact-level metadata (per 
_VIEWS_SQL = """
CREATE VIEW IF NOT EXISTS view_experiments_flat AS
SELECT
  a.doi AS doi,
  r.model_name AS model_name,
  r.article_name AS article_name,
  r.created_at AS run_created_at,
  e.id AS experiment_id,
  e.id_exp AS id_exp,
  e.type AS exp_type,
  e.description AS exp_description,
  e.organism AS organism,
  e.technology AS technology,
  p.name AS probe_name,
  p.amplicon_id AS amplicon_id,
  p.fluorophore AS probe_fluorophore,
  p.quencher AS probe_quencher,
  po.sequence AS probe_sequence,
  po.five_prime_label AS probe_5p_label,
  po.three_prime_label AS probe_3p_label,
  tgo.sequence AS target_sequence,
  o.outcome AS outcome_bool,
  o.comparative_notes AS outcome_notes
FROM experiments e
JOIN runs r ON r.id = e.run_id
JOIN articles a ON a.id = r.article_id
LEFT JOIN probes p ON p.experiment_id = e.id
LEFT JOIN oligos po ON po.id = p.oligo_id
LEFT JOIN target_sequences ts ON ts.experiment_id = e.id
LEFT JOIN oligos tgo ON tgo.id = ts.oligo_id
LEFT JOIN outcomes o ON o.experiment_id = e.id;

CREATE VIEW IF NOT EXISTS view_measurements_flat AS
SELECT
  a.doi,
  r.model_name,
  r.article_name,
  r.created_at AS run_created_at,
  e.id AS experiment_id,
  e.id_exp,
  m.key,
  m.raw,
  m.value,
  m.unit,
  m.si_value,
  m.si_unit,
  m.assumptions
FROM measurements m
JOIN experiments e ON e.id = m.experiment_id
JOIN runs r ON r.id = e.run_id
JOIN articles a ON a.id = r.article_id;
"""


def _ensure_schema(conn: sqlite3.Connection) -> None:
    """Idempotently create every table, index, and view.

    Both scripts use CREATE ... IF NOT EXISTS throughout, so running this
    against an already-initialised database is a no-op.
    """
    cursor = conn.cursor()
    for script in (_TABLES_AND_INDEXES_SQL, _VIEWS_SQL):
        cursor.executescript(script)
    conn.commit()


# ----------------------------- Public API ----------------------------- #

def init_db(db_path: str) -> None:
    """Create (if not exists) the SQLite database schema, indices, and views.

    Args:
        db_path: Path to the SQLite file. Created if it doesn't exist.
    """
    with _db(db_path) as conn:
        _ensure_schema(conn)
+ """ + with _db(db_path) as conn: + _ensure_schema(conn) + cur = conn.cursor() + + doi = article_obj.get("doi", "unknown") + if not doi: + raise ValueError("Input must contain a top-level 'doi' string.") + + has_experiments = isinstance(article_obj.get("experiments"), list) + branch = "experiments" if has_experiments else "no_sequences" + + article_id = _get_or_create_article( + cur, + doi=doi, + article_name=article_name or article_obj.get("article_name"), + abstract=article_obj.get("abstract"), + topic=article_obj.get("topic"), + ) + + run_id = _create_run(cur, article_id, model_name, article_name, branch, raw_json=article_obj) + + # Top-level extraction report (if any) + _insert_extraction_report(cur, run_id, article_obj.get("extraction_report"), experiment_id=None) + + if branch == "no_sequences": + explanation = article_obj.get("explanation_why_does_not_this_article_have_any_hybridization_probes_sequences") or "" + cur.execute( + "INSERT INTO no_sequences_explanations (run_id, explanation) VALUES (?, ?)", + (run_id, explanation), + ) + return run_id + + # ---- experiments branch ---- + for exp in (article_obj.get("experiments") or []): + id_exp = exp.get("id_exp") + desc = exp.get("description") or "" + cur.execute( + """ + INSERT INTO experiments + (run_id, id_exp, type, description, raw_description, + organism, technology, annealing_qualitative, rna_impurities_qualitative) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """, + ( + run_id, + id_exp, + exp.get("type"), + desc, + exp.get("raw_description"), + (exp.get("metadata") or {}).get("organism"), + (exp.get("metadata") or {}).get("technology"), + _to_int_bool(((exp.get("metadata") or {}).get("annealing") or {}).get("qualitative")), + _to_int_bool(((exp.get("metadata") or {}).get("rna_impurities") or {}).get("qualitative")), + ), + ) + experiment_id = cur.lastrowid + + # Per-experiment extraction report + _insert_extraction_report(cur, run_id, exp.get("extraction_report"), experiment_id=experiment_id) + + # Sequences + seqs = exp.get("sequences") or {} + + # Validate the probe looks like a real oligo + probe = seqs.get("probe") or {} + if not _has_real_probe(probe): + # Record and skip probe insertion, but keep experiment row and any metadata/measurements/outcomes + _insert_extraction_report( + cur, run_id, + {"missing": ["/experiments/*/sequences/probe/oligo/sequence"], + "notes": "Rejected probable non-oligo probe (no bases/labels/length)."}, + experiment_id=experiment_id, + ) + else: + # Target (optional) + tgt = seqs.get("target_sequence") + if isinstance(tgt, dict) and (tgt.get("raw") is not None): + tgt_oligo_id = _insert_oligo(cur, tgt) + cur.execute( + "INSERT INTO target_sequences (experiment_id, oligo_id) VALUES (?, ?)", + (experiment_id, tgt_oligo_id), + ) + + # Probe (required by schema; normalized before insert) + probe_oligo = probe.get("oligo") or {} + probe_oligo_id = _insert_oligo(cur, probe_oligo) + sa = _coerce_sa(probe.get("sense_antisense"), probe.get("name")) + cur.execute( + """ + INSERT INTO probes + (experiment_id, name, amplicon_id, oligo_id, fluorophore, quencher, sense_antisense, notes) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
+ """, + ( + experiment_id, + probe.get("name"), + probe.get("amplicon_id"), + probe_oligo_id, + probe.get("fluorophore"), + probe.get("quencher"), + sa, + probe.get("notes"), + ), + ) + + # Primers (optional) + primers = seqs.get("primer_sequences") + if isinstance(primers, dict): + fwd = primers.get("forward") or {} + rev = primers.get("reverse") or {} + fwd_id = _insert_oligo(cur, fwd) + rev_id = _insert_oligo(cur, rev) + cur.execute( + "INSERT INTO primer_pairs (experiment_id, forward_oligo_id, reverse_oligo_id) VALUES (?, ?, ?)", + (experiment_id, fwd_id, rev_id), + ) + + # Related sequences (0..N) + for rs in (seqs.get("related_sequences") or []): + r_oligo = rs.get("related_sequence") + if isinstance(r_oligo, dict) and (r_oligo.get("raw") is not None): + r_oligo_id = _insert_oligo(cur, r_oligo) + cur.execute( + "INSERT INTO related_sequences (experiment_id, oligo_id, description) VALUES (?, ?, ?)", + (experiment_id, r_oligo_id, rs.get("description")), + ) + + # Measurements (experiment_properties + metadata) + exprops = exp.get("experiment_properties") or {} + concs = (exprops.get("concentrations") or {}) + _insert_measurement(cur, experiment_id, "experiment_properties.concentrations.dna_rna_concentration", + concs.get("dna_rna_concentration")) + _insert_measurement(cur, experiment_id, "experiment_properties.concentrations.concentration_SI", + concs.get("concentration_SI")) + + params = (exprops.get("parameters_SI") or {}) + for key in ("temperature", "Tris", "Na", "K", "Mg", "DMSO"): + _insert_measurement(cur, experiment_id, f"experiment_properties.parameters_SI.{key}", params.get(key)) + + meta = exp.get("metadata") or {} + _insert_measurement(cur, experiment_id, "metadata.pH", meta.get("pH")) + ann = meta.get("annealing") or {} + _insert_measurement(cur, experiment_id, "metadata.annealing.quantitative", ann.get("quantitative")) + rimp = meta.get("rna_impurities") or {} + _insert_measurement(cur, experiment_id, "metadata.rna_impurities.quantitative", 
rimp.get("quantitative")) + + # Outcome + out = exp.get("outcome") or {} + cur.execute( + "INSERT INTO outcomes (experiment_id, outcome, comparative_notes) VALUES (?, ?, ?)", + (experiment_id, _to_int_bool(out.get("outcome")), out.get("comparative_notes")), + ) + _insert_measurement(cur, experiment_id, "outcome.fluorescence", out.get("fluorescence")) + + # Pairing (optional) + pair = exp.get("pairing") or {} + if pair.get("paired_with_probe_name") or pair.get("relationship"): + cur.execute( + "INSERT INTO pairings (experiment_id, paired_with_probe_name, relationship) VALUES (?, ?, ?)", + (experiment_id, pair.get("paired_with_probe_name"), pair.get("relationship")), + ) + + return run_id + + +# NEW: perf events API -------------------------------------------------- # + +def _event_defaults(ev: Dict[str, Any]) -> Dict[str, Any]: + d = dict(ev or {}) + for k in ("namespace","article_name","model_name","article_doi","pass_name", + "sequence_key","question_param","started_at","finished_at", + "duration_ms","prompt_tokens","completion_tokens","total_tokens", + "tokens_per_sec","sidecar_path","notes"): + d.setdefault(k, None) + return d + +def insert_perf_event(db_path: str, event: Dict[str, Any]) -> int: + """Insert a single performance/timing event row. 
# Column order shared by the single- and bulk-insert helpers below; keeping
# it in exactly one place guarantees the two INSERT paths can never drift.
_PERF_EVENT_COLUMNS: Tuple[str, ...] = (
    "namespace", "article_name", "model_name", "article_doi", "pass_name",
    "sequence_key", "question_param", "started_at", "finished_at", "duration_ms",
    "prompt_tokens", "completion_tokens", "total_tokens", "tokens_per_sec",
    "sidecar_path", "notes",
)

_PERF_EVENT_INSERT_SQL = """
    INSERT INTO perf_events (
        namespace, article_name, model_name, article_doi, pass_name,
        sequence_key, question_param, started_at, finished_at, duration_ms,
        prompt_tokens, completion_tokens, total_tokens, tokens_per_sec,
        sidecar_path, notes
    )
    VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
"""


def _perf_event_row(event: Dict[str, Any]) -> Tuple[Any, ...]:
    """Order *event*'s values for _PERF_EVENT_INSERT_SQL; missing keys -> NULL."""
    e = dict(event or {})
    return tuple(e.get(k) for k in _PERF_EVENT_COLUMNS)


def insert_perf_event(db_path: str, event: Dict[str, Any]) -> int:
    """Insert a single performance/timing event row.

    Typical usage:
        insert_perf_event(db_path, {
            "namespace": "pass",
            "article_name": "paper123",
            "model_name": "my/model:7b",
            "article_doi": "10.xxxx/yyy",
            "pass_name": "A_core",
            "sequence_key": None,
            "question_param": None,
            "started_at": "...",
            "finished_at": "...",
            "duration_ms": 1234.5,
            "prompt_tokens": 4567,
            "completion_tokens": 890,
            "total_tokens": 5457,
            "tokens_per_sec": 12.34,
            "sidecar_path": "/path/to/file.sidecar.json",
            "notes": "ok"
        })

    Returns:
        The new perf_events row id.
    """
    with _db(db_path) as conn:
        _ensure_schema(conn)
        cur = conn.cursor()
        cur.execute(_PERF_EVENT_INSERT_SQL, _perf_event_row(event))
        return cur.lastrowid


def insert_perf_events(db_path: str, events: List[Dict[str, Any]]) -> List[int]:
    """Bulk insert multiple performance events.

    Returns:
        The new row ids in input order; an empty input inserts nothing.
    """
    ids: List[int] = []
    if not events:
        return ids
    with _db(db_path) as conn:
        _ensure_schema(conn)
        cur = conn.cursor()
        for ev in events:
            cur.execute(_PERF_EVENT_INSERT_SQL, _perf_event_row(ev))
            ids.append(cur.lastrowid)
        return ids


# NEW: pipeline_artifacts API ------------------------------------------ #

def _artifact_defaults(rec: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize/complete an artifact record dict before DB insert.

    Expected keys in `rec`:
        model_name        : str
        article_name      : str
        pass_name         : str  (e.g. 'A_core', 'SeqDesc-OPTIM', 'FULL')
        artifact_path     : str  (absolute or project-rel path to main JSON)
        sidecar_path      : str|None (path to sidecar .perf.json)
        started_at        : str|None (ISO8601 UTC)
        finished_at       : str|None (ISO8601 UTC)
        duration_ms       : float|None
        prompt_tokens     : int|None
        completion_tokens : int|None
        total_tokens      : int|None
        tokens_per_sec    : float|None
        success           : bool|int|None
        notes             : str|None
    """
    d = dict(rec or {})
    for k in (
        "model_name", "article_name", "pass_name", "artifact_path",
        "sidecar_path", "started_at", "finished_at", "duration_ms",
        "prompt_tokens", "completion_tokens", "total_tokens", "tokens_per_sec",
        "success", "notes"
    ):
        d.setdefault(k, None)

    # Convert success -> int bool (1/0/NULL) so SQLite stores a clean flag.
    d["success"] = _to_int_bool(d.get("success"))
    return d
def insert_pipeline_artifact(db_path: str, artifact: Dict[str, Any]) -> int:
    """Insert a single pipeline_artifacts row.

    Captures per-pass / per-article / per-model artifact bookkeeping
    (timings + token usage for the JSON file) and marks a pass as
    'successfully finished' for continuation.

    NOTE:
        This is a plain INSERT. The table has a UNIQUE constraint on
        (model_name, article_name, pass_name, artifact_path), so the pipeline
        must generate unique artifact_path names (it already embeds a
        timestamped suffix in filenames). Rows are intentionally never
        overwritten.

    Returns:
        row_id (int).
    """
    columns = (
        "model_name", "article_name", "pass_name", "artifact_path", "sidecar_path",
        "started_at", "finished_at", "duration_ms",
        "prompt_tokens", "completion_tokens", "total_tokens", "tokens_per_sec",
        "success", "notes",
    )
    with _db(db_path) as conn:
        _ensure_schema(conn)
        normalized = _artifact_defaults(artifact)
        cur = conn.cursor()
        cur.execute(
            """
            INSERT INTO pipeline_artifacts (
                model_name, article_name, pass_name, artifact_path, sidecar_path,
                started_at, finished_at, duration_ms,
                prompt_tokens, completion_tokens, total_tokens, tokens_per_sec,
                success, notes
            )
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)
            """,
            tuple(normalized[c] for c in columns),
        )
        return cur.lastrowid


def get_pipeline_artifacts_for_article(
    db_path: str,
    model_name: str,
    article_name: str,
) -> List[Dict[str, Any]]:
    """Fetch all recorded artifacts for (model_name, article_name).

    Useful for:
    - debugging
    - external QC scripts
    - continuation logic in the pipeline (to see what's already done)

    Returns:
        A list (possibly empty) of dicts, newest-first by finished_at
        (rows with NULL finished_at sort last).
    """
    columns = (
        "id", "model_name", "article_name", "pass_name", "artifact_path",
        "sidecar_path", "started_at", "finished_at", "duration_ms",
        "prompt_tokens", "completion_tokens", "total_tokens", "tokens_per_sec",
        "success", "notes",
    )
    with _db(db_path) as conn:
        _ensure_schema(conn)
        cur = conn.cursor()
        cur.execute(
            """
            SELECT
                id, model_name, article_name, pass_name, artifact_path,
                sidecar_path, started_at, finished_at, duration_ms,
                prompt_tokens, completion_tokens, total_tokens, tokens_per_sec,
                success, notes
            FROM pipeline_artifacts
            WHERE model_name = ?
              AND article_name = ?
            ORDER BY
                CASE WHEN finished_at IS NULL THEN 1 ELSE 0 END,
                finished_at DESC
            """,
            (model_name, article_name),
        )
        records: List[Dict[str, Any]] = []
        for row in cur.fetchall():
            record = dict(zip(columns, row))
            # Expose success as a Python bool (or None when never recorded).
            flag = record["success"]
            record["success"] = bool(flag) if flag is not None else None
            records.append(record)
        return records
+ """ + artifacts = get_pipeline_artifacts_for_article( + db_path=db_path, model_name=model_name, article_name=article_name + ) + + best: Dict[str, Dict[str, Any]] = {} + for row in artifacts: + pname = row["pass_name"] + if not row.get("success"): + continue + prev = best.get(pname) + if prev is None: + best[pname] = row + continue + # pick newer finished_at + prev_finished = prev.get("finished_at") + cur_finished = row.get("finished_at") + # if prev_finished is None but cur_finished not None -> prefer current + # if both not None -> compare lexicographically (ISO8601 so OK) + take_current = False + if prev_finished is None and cur_finished is not None: + take_current = True + elif prev_finished is not None and cur_finished is not None: + if str(cur_finished) > str(prev_finished): + take_current = True + elif prev_finished is None and cur_finished is None: + # keep first, arbitrary + take_current = False + # else prev has finished_at but current doesn't — keep prev. + if take_current: + best[pname] = row + + # Only expose a shallow summary for convenience + summarized: Dict[str, Dict[str, Any]] = {} + for pname, row in best.items(): + summarized[pname] = { + "finished_at": row.get("finished_at"), + "artifact_path": row.get("artifact_path"), + "sidecar_path": row.get("sidecar_path"), + "duration_ms": row.get("duration_ms"), + "prompt_tokens": row.get("prompt_tokens"), + "completion_tokens": row.get("completion_tokens"), + "total_tokens": row.get("total_tokens"), + "tokens_per_sec": row.get("tokens_per_sec"), + "notes": row.get("notes"), + } + return summarized + + +# ----------------------------- Ollama-style helper tools ----------------------------- # + +def to_si(value: Optional[float], unit: Optional[str]) -> Tuple[Optional[float], Optional[str]]: + """Convert a numeric value and unit to SI. + + Supports common units from hybridization papers: + - Temperature: °C -> K (K = °C + 273.15), K stays K. 
def to_si(value: Optional[float], unit: Optional[str]) -> Tuple[Optional[float], Optional[str]]:
    """Convert a numeric value and unit to SI.

    Supports common units from hybridization papers:
    - Temperature: °C -> K (K = °C + 273.15), K stays K.
    - Concentration: M, mM, µM/uM, nM -> mol/m^3 (1 mM = 1 mol/m^3).
    - Percent: % -> dimensionless fraction (value/100).

    Args:
        value: The numeric value parsed from the article, or None if unknown.
        unit: The unit string as written in the article (e.g., '°C', 'C', 'mM', '%'), or None.

    Returns:
        A pair (si_value, si_unit):
        - si_value: The value converted to SI, or None if not convertible.
        - si_unit: The SI unit string ('K', 'mol/m^3', 'dimensionless'), or None if not convertible.

    Examples:
        >>> to_si(25, '°C')
        (298.15, 'K')
        >>> to_si(2, 'mM')
        (2.0, 'mol/m^3')
        >>> to_si(10, '%')
        (0.1, 'dimensionless')
    """
    if value is None or unit is None:
        return None, None

    # Normalise once: trim, lowercase, and fold the micro sign to plain 'u'
    # so 'µM' and 'uM' take the same branch below.
    u = unit.strip().lower().replace('µ', 'u')

    # Temperature
    if u in {'c', '°c', 'deg c', 'celsius'}:
        return value + 273.15, 'K'
    if u in {'k', 'kelvin'}:
        return value, 'K'

    # Concentration (to mol/m^3); 1 mol/L == 1000 mol/m^3.
    if u in {'m', 'mol/l'}:
        return value * 1000.0, 'mol/m^3'
    if u in {'mm', 'mmol/l', 'mmol'}:  # 'mm' also covers mM as often OCR'd
        return value * 1.0, 'mol/m^3'
    if u in {'um', 'umol/l', 'micromolar'}:  # µ variants already folded to 'u'
        return value * 1e-3, 'mol/m^3'
    if u in {'nm', 'nmol/l', 'nanomolar'}:
        return value * 1e-6, 'mol/m^3'

    # Percent
    if u in {'%', 'percent', 'perc'}:
        return value / 100.0, 'dimensionless'

    return None, None
# Decorated-oligo pattern. The named groups are restored here: the extracted
# text had lost the `<name>` part of each `(?P<...>)` (which does not compile);
# the names match the m.group(...) accesses in parse_oligo below.
OLIGO_RE = re.compile(r"""
^\s*
(?:(?P<prime>(?:5|3)(?:['′’]|0|O)?)\s*-\s*)?   # 5'/3' prefix, incl. OCR'd 50/5O
(?:(?P<prefix>(?:[A-Za-z0-9+]+-)+))?           # leading labels, e.g. FAM-
(?P<seq>[ACGUTRYSWKMBDHVN]+)                   # IUPAC bases
(?:(?P<suffix>(?:-[A-Za-z0-9+]+)+))?           # trailing labels, e.g. -BHQ2
(?:\s*\(\s*(?P<len>\d+)\s*(?:b|bp)\s*\)\s*)?   # optional "(27 b)" length tag
\s*$
""", re.X)


def parse_oligo(raw: Optional[str]) -> Dict[str, Any]:
    """Parse a decorated oligo string into schema-ready parts.

    Accepts OCR-prone patterns like "50-FAM-...-BHQ2 (27 b)" and normalizes:
    - prime_prefix: 5 or 3 when 5′/3′ (includes 50/5O variants)
    - sequence: IUPAC bases (uppercase)
    - length_bases: integer if present
    - labels: all labels in order; five_prime_label and three_prime_label are
      the first/last, respectively

    Args:
        raw: The exact oligo string from the article (may include labels and length), or None.

    Returns:
        A dict matching the 'decoratedOligo' shape (minus provenance):
        {
            "raw": str or None,
            "sequence": str or None,
            "length_bases": int or None,
            "prime_prefix": 5|3|None,
            "five_prime_label": str or None,
            "three_prime_label": str or None,
            "labels": List[str],
            "sense_antisense": None
        }
    """
    result: Dict[str, Any] = {
        "raw": raw,
        "sequence": None,
        "length_bases": None,
        "prime_prefix": None,
        "five_prime_label": None,
        "three_prime_label": None,
        "labels": [],
        "sense_antisense": None
    }
    if not raw:
        return result

    m = OLIGO_RE.match(raw)
    if not m:
        # Unparseable decoration: keep the raw text, leave everything unset.
        return result

    prime = m.group('prime')
    if prime:
        result["prime_prefix"] = 5 if prime.startswith('5') else 3

    seq = m.group('seq')
    if seq:
        result["sequence"] = seq.upper()

    if m.group('len'):
        result["length_bases"] = int(m.group('len'))

    labels: List[str] = []
    if m.group('prefix'):
        labels += [x for x in m.group('prefix').split('-') if x]
    if m.group('suffix'):
        labels += [x for x in m.group('suffix').split('-') if x]
    result["labels"] = labels
    if labels:
        result["five_prime_label"] = labels[0]
        result["three_prime_label"] = labels[-1]

    return result
def make_measurement(raw: Optional[str],
                     value: Optional[float] = None,
                     unit: Optional[str] = None) -> Dict[str, Any]:
    """Build a 'measurement' object with SI conversion.

    Convenience helper that fills the schema's measurement shape while
    keeping the raw article text.

    Args:
        raw: The raw textual measurement from the article (e.g., '58 °C', '2 mM', '10%').
        value: Parsed numeric value, if available.
        unit: Parsed unit string as written in the article (e.g., '°C', 'mM', '%').

    Returns:
        A dict with keys: raw, value, unit, si_value, si_unit, assumptions.
        Unknown or unsupported units yield si_value/si_unit = None.
    """
    if value is None or unit is None:
        si_value, si_unit = None, None
    else:
        si_value, si_unit = to_si(value, unit)
    return {
        "raw": raw or "",
        "value": value,
        "unit": unit,
        "si_value": si_value,
        "si_unit": si_unit,
        "assumptions": None
    }


# ----------------------------- Normalization / validation helpers ----------------------------- #

_SA_MAP = {
    's': 'sense',
    'sense': 'sense',
    'as': 'antisense',
    'antisense': 'antisense',
    '+': 'sense',
    '-': 'antisense',
    'forward': 'sense',
    'reverse': 'antisense',
}
# Trailing "(...)s" / "(...)as" suffix on probe names, e.g. 'N3-FAM(27)s'.
_SA_NAME_RE = re.compile(r"\)\s*(as|s)\s*$", re.IGNORECASE)


def _detect_sa_from_name(probe_name: Optional[str]) -> Optional[str]:
    """Infer sense/antisense from a trailing '(...)s' or '(...)as' in the probe name.

    Args:
        probe_name: Probe name (e.g., 'N3-FAM(27)s').

    Returns:
        'sense', 'antisense', or None if not inferable.
    """
    if not probe_name:
        return None
    match = _SA_NAME_RE.search(probe_name.strip())
    if match is None:
        return None
    return 'antisense' if match.group(1).lower() == 'as' else 'sense'
+ """ + if value is None or (isinstance(value, str) and not value.strip()): + return _detect_sa_from_name(probe_name) + v = str(value).strip().lower() + if v in _SA_MAP: + return _SA_MAP[v] + return _detect_sa_from_name(probe_name) + + +def _coerce_prime_prefix(value: Any) -> Optional[int]: + """Clamp prime prefix to {3, 5} or None. + + Handles OCR-like strings such as '5', "5'", '50', '5O', '5′'. + + Args: + value: Raw input for prime prefix. + + Returns: + 3, 5, or None. + """ + if value is None: + return None + s = str(value).strip() + if s.startswith('5'): + return 5 + if s.startswith('3'): + return 3 + return None + + +def _has_real_probe(probe: Dict[str, Any]) -> bool: + """Heuristic gate: reject obviously non-oligo 'probes'. + + Accepts a probe only if at least one of these holds: + - >= 6 IUPAC bases appear in oligo.sequence or oligo.raw + - a known label is present (FAM/ROX/Cy5/BHQ1/BHQ2/RTQ1) in labels/five/three + - length_bases is present + + Args: + probe: The 'probe' dict from the schema. + + Returns: + True if looks like a real oligo; False otherwise. 
+ """ + if not isinstance(probe, dict): + return False + oligo = probe.get("oligo") or {} + raw = (oligo.get("raw") or "") + seq = (oligo.get("sequence") or "") + has_bases = bool(re.search(r"[ACGUTRYSWKMBDHVN]{6,}", (seq or raw).upper())) + has_label = any(bool(oligo.get(k)) for k in ("five_prime_label", "three_prime_label")) \ + or bool(oligo.get("labels")) + has_length = bool(oligo.get("length_bases")) + return has_bases or has_label or has_length + + +# ----------------------------- DB helpers ----------------------------- # + +def _utcnow_iso() -> str: + """UTC timestamp in ISO8601 format.""" + return datetime.now(timezone.utc).isoformat() + + +def _get_or_create_article(cur: sqlite3.Cursor, doi: str, + article_name: Optional[str], + abstract: Optional[str], + topic: Optional[str]) -> int: + """Fetch article.id by DOI, creating the row if needed (and refreshing metadata).""" + cur.execute("SELECT id FROM articles WHERE doi = ?", (doi,)) + row = cur.fetchone() + if row: + article_id = row[0] + cur.execute( + """ + UPDATE articles + SET latest_article_name = COALESCE(?, latest_article_name), + latest_abstract = COALESCE(?, latest_abstract), + latest_topic = COALESCE(?, latest_topic) + WHERE id = ? + """, + (article_name, abstract, topic, article_id), + ) + return article_id + cur.execute( + """ + INSERT INTO articles (doi, latest_article_name, latest_abstract, latest_topic, created_at) + VALUES (?, ?, ?, ?, ?) + """, + (doi, article_name, abstract, topic, _utcnow_iso()), + ) + return cur.lastrowid + + +def _create_run(cur: sqlite3.Cursor, article_id: int, model_name: str, + article_name: Optional[str], branch: str, + raw_json: Dict[str, Any]) -> int: + """Create a run row and persist the raw JSON payload.""" + cur.execute( + """ + INSERT INTO runs (article_id, model_name, article_name, branch, created_at) + VALUES (?, ?, ?, ?, ?) 
+ """, + (article_id, model_name, article_name, branch, _utcnow_iso()), + ) + run_id = cur.lastrowid + cur.execute("INSERT INTO raw_payloads (run_id, json) VALUES (?, ?)", + (run_id, json.dumps(raw_json, ensure_ascii=False))) + return run_id + + +def _insert_provenance_cols(entity: Dict[str, Any]) -> Tuple[Optional[str], Optional[int], Optional[str], Optional[str], Optional[str]]: + """Extract provenance fields with safe defaults.""" + prov = entity.get("provenance") or {} + return ( + prov.get("source_type"), + prov.get("page"), + prov.get("section"), + prov.get("quote"), + prov.get("notes"), + ) + + +def _insert_oligo(cur: sqlite3.Cursor, oligo: Dict[str, Any]) -> int: + """Insert an oligo row after normalizing prime_prefix and sense/antisense.""" + cleaned = dict(oligo or {}) + cleaned["prime_prefix"] = _coerce_prime_prefix(cleaned.get("prime_prefix")) + cleaned["sense_antisense"] = _coerce_sa(cleaned.get("sense_antisense")) + + ps, pg, sc, qu, no = _insert_provenance_cols(cleaned) + cur.execute( + """ + INSERT INTO oligos + (raw, sequence, length_bases, prime_prefix, + five_prime_label, three_prime_label, sense_antisense, + provenance_source_type, provenance_page, provenance_section, + provenance_quote, provenance_notes) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """, + ( + cleaned.get("raw", ""), + cleaned.get("sequence"), + cleaned.get("length_bases"), + cleaned.get("prime_prefix"), + cleaned.get("five_prime_label"), + cleaned.get("three_prime_label"), + cleaned.get("sense_antisense"), + ps, pg, sc, qu, no, + ), + ) + return cur.lastrowid + + +def _insert_measurement(cur: sqlite3.Cursor, experiment_id: int, key: str, m: Optional[Dict[str, Any]]) -> None: + """Insert a measurement if present.""" + if not m: + return + ps, pg, sc, qu, no = _insert_provenance_cols(m) + cur.execute( + """ + INSERT INTO measurements + (experiment_id, key, raw, value, unit, si_value, si_unit, assumptions, + provenance_source_type, provenance_page, provenance_section, provenance_quote, provenance_notes) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + experiment_id, + key, + (m.get("raw") or ""), + m.get("value"), + m.get("unit"), + m.get("si_value"), + m.get("si_unit"), + m.get("assumptions"), + ps, pg, sc, qu, no, + ), + ) + + +def _insert_extraction_report(cur: sqlite3.Cursor, run_id: int, + report: Optional[Dict[str, Any]], + experiment_id: Optional[int] = None) -> None: + """Insert extraction report entries (missing/uncertain pointers).""" + if not report: + return + for kind in ("missing", "uncertain"): + for ptr in report.get(kind, []) or []: + cur.execute( + """ + INSERT INTO extraction_report_entries (run_id, experiment_id, kind, json_pointer, notes) + VALUES (?, ?, ?, ?, ?) 
+ """, + (run_id, experiment_id, kind, str(ptr), report.get("notes")), + ) + + +def _to_int_bool(val: Optional[bool]) -> Optional[int]: + """Convert Python bool/None -> 1/0/NULL for SQLite.""" + if val is None: + return None + return 1 if bool(val) else 0 + +# ────────────────────────────────────────────────────────────────────── +# Sequence-descriptors DB (no collision; separate "seqdesc_*" namespace) +# ────────────────────────────────────────────────────────────────────── + +def _extract_doi_from_text(text: str) -> Optional[str]: + """Heuristic DOI extractor from article text (fallback).""" + if not text: + return None + m = re.search(r"\b10\.\d{4,9}/[^\s\"'<>]+", text, flags=re.I) + return m.group(0).rstrip(".,);]") if m else None + +def _ensure_seqdesc_schema(conn: sqlite3.Connection) -> None: + """Create the seqdesc_* schema if it does not exist.""" + conn.execute("PRAGMA foreign_keys = ON;") + # Runs table + conn.execute(""" + CREATE TABLE IF NOT EXISTS seqdesc_runs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + created_at TEXT NOT NULL, + model_name TEXT NOT NULL, + article_name TEXT NOT NULL, + doi TEXT, + source_path TEXT, + raw_json TEXT NOT NULL + ); + """) + conn.execute("CREATE INDEX IF NOT EXISTS idx_seqdesc_runs_article ON seqdesc_runs(article_name);") + conn.execute("CREATE INDEX IF NOT EXISTS idx_seqdesc_runs_doi ON seqdesc_runs(doi);") + conn.execute("CREATE INDEX IF NOT EXISTS idx_seqdesc_runs_model ON seqdesc_runs(model_name);") + + # Sequences table (one row per sequence key in the run) + conn.execute(""" + CREATE TABLE IF NOT EXISTS seqdesc_sequences ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id INTEGER NOT NULL REFERENCES seqdesc_runs(id) ON DELETE CASCADE, + sequence_key TEXT NOT NULL, -- the dict key (probe string as found) + is_seq INTEGER, -- NULL/0/1 + sequence_full TEXT, + sequence_normalized TEXT, + sequence_expanded TEXT, + sequence_backbone TEXT, + sequence_backbone_expanded TEXT, + fluorophore TEXT, + quencher TEXT, + target_raw 
TEXT, + target_normalized TEXT, + primers_forward TEXT, + primers_reverse TEXT, + pH REAL, + annealing_raw TEXT, + T_value REAL, + T_unit TEXT, + Tris_value REAL, + Tris_unit TEXT, + Na_value REAL, + Na_unit TEXT, + K_value REAL, + K_unit TEXT, + Mg_value REAL, + Mg_unit TEXT, + DMSO_value REAL, + DMSO_unit TEXT, + outcome INTEGER, -- NULL/0/1 + raw_json TEXT NOT NULL + ); + """) + conn.execute("CREATE INDEX IF NOT EXISTS idx_seqdesc_sequences_run ON seqdesc_sequences(run_id);") + + # Modifications table (0..N per sequence) + conn.execute(""" + CREATE TABLE IF NOT EXISTS seqdesc_modifications ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + sequence_id INTEGER NOT NULL REFERENCES seqdesc_sequences(id) ON DELETE CASCADE, + modification_position INTEGER, + modification_type TEXT, + modification_description TEXT + ); + """) + conn.execute("CREATE INDEX IF NOT EXISTS idx_seqdesc_mods_seq ON seqdesc_modifications(sequence_id);") + + # Helpful views (namespaced) + conn.execute(""" + CREATE VIEW IF NOT EXISTS seqdesc_v_sequences AS + SELECT + r.id AS run_id, + r.created_at AS run_created_at, + r.model_name AS model_name, + r.article_name AS article_name, + r.doi AS doi, + s.* + FROM seqdesc_sequences s + JOIN seqdesc_runs r ON r.id = s.run_id; + """) + conn.execute(""" + CREATE VIEW IF NOT EXISTS seqdesc_v_modifications AS + SELECT + s.run_id, + s.id AS sequence_id, + s.sequence_key, + m.modification_position, + m.modification_type, + m.modification_description + FROM seqdesc_modifications m + JOIN seqdesc_sequences s ON s.id = m.sequence_id; + """) + +def _coerce_bool_to_int(x: Any) -> Optional[int]: + if x is None: + return None + if isinstance(x, bool): + return 1 if x else 0 + # sometimes LLMs send "true"/"false" + xs = str(x).strip().lower() + if xs in {"true", "1", "yes"}: + return 1 + if xs in {"false", "0", "no"}: + return 0 + return None + +def _coerce_float(x: Any) -> Optional[float]: + try: + return float(x) if x is not None else None + except Exception: + return 
None + +def _extract_measure(obj: Any) -> Tuple[Optional[float], Optional[str]]: + """obj like {"value": 50, "unit": "mM"} or None -> (50.0, 'mM')""" + if isinstance(obj, dict): + return _coerce_float(obj.get("value")), (obj.get("unit") if obj.get("unit") is not None else None) + return None, None + +def insert_seqdesc_object( + db_path: Path | str, + *, + article_name: str, + doi: Optional[str], + model_name: str, + sequence_descriptors: List[Tuple[str, Dict[str, Any]]], + source_path: Optional[Path] = None, +) -> int: + """Insert one 'run' of sequence descriptors and return run_id. + + The payload shape: + { + "": { + "is_seq": bool|None, + "sequence_full": str|None, + ... + "modifications": [{"modification_position": int, "modification_type": str, "modification_description": str}, ...], + "primers": {"forward": str|None, "reverse": str|None}, + "T": {"value": float, "unit": str}|None, + ... + }, + ... + } + """ + created_at = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") + raw_json = json.dumps(sequence_descriptors, ensure_ascii=False) + + db_path = Path(db_path) + db_path.parent.mkdir(parents=True, exist_ok=True) + + with closing(sqlite3.connect(str(db_path))) as conn: + conn.execute("PRAGMA journal_mode = WAL;") + _ensure_seqdesc_schema(conn) + + with conn: # transaction + cur = conn.cursor() + cur.execute( + """ + INSERT INTO seqdesc_runs(created_at, model_name, article_name, doi, source_path, raw_json) + VALUES (?, ?, ?, ?, ?, ?) 
+ """, + ( + created_at, + model_name, + article_name, + doi, + str(source_path) if source_path else None, + raw_json, + ), + ) + run_id = cur.lastrowid + + for seq_key, payload in (sequence_descriptors or []): + # For very sparse entries, payload can be {} — guard everything + payload = payload or {} + + is_seq = _coerce_bool_to_int(payload.get("is_seq")) + seq_full = payload.get("sequence_full") + seq_norm = payload.get("sequence_normalized") + seq_exp = payload.get("sequence_expanded") + seq_bb = payload.get("sequence_backbone") + seq_bb_exp = payload.get("sequence_backbone_expanded") + fluor = payload.get("fluorophore") + quen = payload.get("quencher") + target_raw = payload.get("target_raw") + target_norm = payload.get("target_normalized") + + primers = payload.get("primers") or {} + primers_forward = primers.get("forward") + primers_reverse = primers.get("reverse") + + pH_val = _coerce_float(payload.get("pH")) + anneal_raw = payload.get("annealing_raw") + + T_val, T_unit = _extract_measure(payload.get("T")) + Tris_val, Tris_unit = _extract_measure(payload.get("Tris")) + Na_val, Na_unit = _extract_measure(payload.get("Na")) + K_val, K_unit = _extract_measure(payload.get("K")) + Mg_val, Mg_unit = _extract_measure(payload.get("Mg")) + DMSO_val, DMSO_unit = _extract_measure(payload.get("DMSO")) + + outcome = _coerce_bool_to_int(payload.get("outcome")) + + cur.execute( + """ + INSERT INTO seqdesc_sequences( + run_id, sequence_key, is_seq, + sequence_full, sequence_normalized, sequence_expanded, + sequence_backbone, sequence_backbone_expanded, + fluorophore, quencher, + target_raw, target_normalized, + primers_forward, primers_reverse, + pH, annealing_raw, + T_value, T_unit, + Tris_value, Tris_unit, + Na_value, Na_unit, + K_value, K_unit, + Mg_value, Mg_unit, + DMSO_value, DMSO_unit, + outcome, raw_json + ) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """, + ( + run_id, seq_key, is_seq, + seq_full, seq_norm, seq_exp, + seq_bb, seq_bb_exp, + fluor, quen, + target_raw, target_norm, + primers_forward, primers_reverse, + pH_val, anneal_raw, + T_val, T_unit, + Tris_val, Tris_unit, + Na_val, Na_unit, + K_val, K_unit, + Mg_val, Mg_unit, + DMSO_val, DMSO_unit, + outcome, json.dumps(payload, ensure_ascii=False), + ), + ) + sequence_id = cur.lastrowid + + # Modifications (array of objects) + for m in payload.get("modifications") or []: + if not isinstance(m, dict): + continue + cur.execute( + """ + INSERT INTO seqdesc_modifications( + sequence_id, modification_position, modification_type, modification_description + ) VALUES (?,?,?,?) + """, + ( + sequence_id, + m.get("modification_position"), + m.get("modification_type"), + m.get("modification_description"), + ), + ) + + return run_id diff --git a/extraction/passes/A_core/prompt.txt b/extraction/passes/A_core/prompt.txt new file mode 100644 index 0000000..82b666b --- /dev/null +++ b/extraction/passes/A_core/prompt.txt @@ -0,0 +1,19 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions that you have to answer based only on the provided article text.
+
+STRICT RULES for how you work and respond:
+* Never invent values; use `null` when unknown.
+* Keep text exactly as in the article (no ellipses, no expansions).
+* Output all data fully, never skip or insert ellipses.
+* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`.
+* Use the article's wording for names.
+* Do not copy sequences from examples!
+* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis.
+* Use ONLY English language and Latin script, only ASCII.
+* Output only a single JSON object that conforms to the provided JSON Schema.
+* For the perfect result compliant to all constraints and limitations I will tip $2000!
+
+Perform the following tasks:
+* Extract the article's doi, abstract, and topic (short label).
+* If any is missing, set it to `null` and list in `extraction_report.missing`.
diff --git a/extraction/passes/A_core/schema.json b/extraction/passes/A_core/schema.json new file mode 100644 index 0000000..5ff4bb3 --- /dev/null +++ b/extraction/passes/A_core/schema.json @@ -0,0 +1,22 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "ArticleCore", + "type": "object", + "additionalProperties": false, + "required": ["doi", "abstract", "topic", "extraction_report"], + "properties": { + "doi": { "type": "string", "minLength": 4, "maxLength": 200 }, + "abstract": { "type": "string", "minLength": 10, "maxLength": 5000 }, + "topic": { "type": "string", "minLength": 2, "maxLength": 200 }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "notes": { "type": ["string", "null"] } + } + } + } +} diff --git a/extraction/passes/B1_index_types/prompt.txt b/extraction/passes/B1_index_types/prompt.txt new file mode 100644 index 0000000..6dc2c08 --- /dev/null +++ b/extraction/passes/B1_index_types/prompt.txt @@ -0,0 +1,21 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions that you have to answer based only on the provided article text.
+
+STRICT RULES for how you work and respond:
+* Never invent values; use `null` when unknown.
+* Keep text exactly as in the article (no ellipses, no expansions).
+* Output all data fully, never skip or insert ellipses.
+* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`.
+* Use the article's wording for names.
+* Do not copy sequences from examples!
+* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis.
+* Use ONLY English language and Latin script, only ASCII.
+* Output only a single JSON object that conforms to the provided JSON Schema.
+* For the perfect result compliant to all constraints and limitations I will tip $2000!
+
+Perform the following tasks:
+* Identify each hybridization experiment or probe pairing described.
+* Assign a stable id_exp (e.g., N3-FAM-27-s or a short unique tag you derive).
+* Provide a brief description and, if present verbatim, a raw_description.
+* If experiment types are stated (e.g., DMA, qPCR), fill type; else null.
diff --git a/extraction/passes/B1_index_types/schema.json b/extraction/passes/B1_index_types/schema.json new file mode 100644 index 0000000..9d4579c --- /dev/null +++ b/extraction/passes/B1_index_types/schema.json @@ -0,0 +1,238 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "ExperimentIndex", + "type": "object", + "additionalProperties": false, + "required": ["experiments", "extraction_report"], + "properties": { + "experiments": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "description", "type", "raw_description"], + "properties": { + "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + "type": { + "title": "Hybridization Probe Classification", + "description": "Normalized, multi-axis classification for nucleic-acid hybridization probes (literature or product datasheets). All fields are optional to accommodate incomplete metadata.", + "type": "object", + "additionalProperties": true, + "properties": { + "probe_type": { + "title": "Probe Type", + "description": "Functional design/mechanism class of the probe. Use when the probe's reporting/capture behavior is known.", + "oneOf": [ + { "const": "linear", "title": "Linear", "description": "Simple oligo that hybridizes without structural activation; often end-labeled." }, + { "const": "molecular_beacon", "title": "Molecular beacon", "description": "Stem-loop with fluor/quencher; fluorescence turns on when the stem opens on target." }, + { "const": "hydrolysis_taqman", "title": "Hydrolysis (TaqMan)", "description": "Dual-labeled probe cleaved by 5'->3' exonuclease during PCR; signal increases on cleavage." }, + { "const": "fret_dual_hybridization", "title": "FRET dual-hybridization", "description": "Two adjacent probes with donor/acceptor dyes; FRET only when both bind nearby." 
}, + { "const": "scorpion", "title": "Scorpion", "description": "Primer-linked hairpin probe enabling fast intra-molecular reporting in qPCR." }, + { "const": "hcr", "title": "Hybridization Chain Reaction (HCR)", "description": "Initiator + hairpins polymerize upon target binding; enzyme-free signal amplification." }, + { "const": "branched_dna", "title": "Branched DNA (bDNA)", "description": "Signal amplification via multibranch DNA scaffolds without target amplification." }, + { "const": "padlock", "title": "Padlock", "description": "Linear probe circularizes on perfect match; detected via rolling-circle amplification." }, + { "const": "capture", "title": "Capture", "description": "Affinity-tagged probe used to enrich/pull down targets (e.g., biotinylated capture baits)." }, + { "const": "tiling_set", "title": "Tiling set", "description": "Multiple overlapping probes across a region/gene for robust detection." }, + { "const": "antisense", "title": "Antisense", "description": "Probe designed to bind and block or track transcripts (e.g., ISH tracking)." } + ] + }, + "chemistry": { + "title": "Chemistry", + "description": "Backbone and chemical modifications (sequence-level chemistry rather than labels/reporters).", + "type": "object", + "additionalProperties": false, + "properties": { + "backbone": { + "title": "Backbone", + "description": "Primary nucleic-acid scaffold used by the probe.", + "oneOf": [ + { "const": "dna", "title": "DNA", "description": "Unmodified DNA backbone." }, + { "const": "rna", "title": "RNA", "description": "Unmodified RNA backbone." }, + { "const": "cdna", "title": "cDNA", "description": "Complementary DNA derived from RNA." }, + { "const": "pna", "title": "PNA", "description": "Peptide nucleic acid backbone." }, + { "const": "morpholino", "title": "Morpholino", "description": "Morpholine-ring phosphorodiamidate backbone." 
}, + { "const": "lna_modified", "title": "LNA-modified", "description": "DNA/RNA with Locked Nucleic Acid residues incorporated." }, + { "const": "two_ome_rna", "title": "2′-O-Me RNA", "description": "2′-O-methyl RNA backbone." } + ] + }, + "modifications": { + "title": "Chemical Modifications", + "description": "Sequence-level chemistry (not labels): e.g., nuclease protection or affinity spacers.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "phosphorothioate", + "two_ome_spiked", + "lna_spiked", + "mgb", + "inverted_dT_3prime", + "amine_5prime", + "thiol_5prime", + "biotin_teg", + "spacer_18", + "cholesterol" + ], + "description": "Common modification keyword." + } + } + } + }, + "labeling": { + "title": "Labeling", + "description": "Reporting/enrichment labels attached to the probe (fluors, quenchers, haptens, enzymes, isotopes).", + "type": "object", + "additionalProperties": false, + "properties": { + "strategy": { + "title": "Label Strategy", + "description": "High-level labeling approach; combine with concrete labels below as known.", + "type": "string", + "enum": ["none","fluor_only","fluor_quencher","hapten","enzymatic","radioisotope"] + }, + "reporters": { + "title": "Reporter Dyes", + "description": "Fluorophores or other reporters (free text to allow any brand/dye).", + "type": "array", + "uniqueItems": true, + "items": { "type": "string", "description": "Reporter name (e.g., FAM, HEX, Cy3, ATTO647N)." } + }, + "quenchers": { + "title": "Quenchers", + "description": "Quenchers used in hydrolysis or hairpin probes.", + "type": "array", + "uniqueItems": true, + "items": { "type": "string", "description": "Quencher name (e.g., BHQ1, BHQ2, Iowa Black FQ)." 
} + }, + "haptens": { + "title": "Haptens", + "description": "Affinity tags detected by antibodies/streptavidin.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": ["biotin","digoxigenin","dinitrophenol","fluorescein_hapten"], + "description": "Common hapten tag." + } + }, + "enzymes": { + "title": "Enzyme Labels", + "description": "Enzyme conjugates used for colorimetric/chemiluminescent detection.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": ["HRP","AP"], + "description": "Common conjugated enzyme." + } + }, + "isotopes": { + "title": "Radioisotopes", + "description": "If radio-labeled, indicate isotope(s).", + "type": "array", + "uniqueItems": true, + "items": { "type": "string", "description": "Isotope (e.g., 32P, 33P, 35S)." } + } + } + }, + "targeting": { + "title": "Targeting", + "description": "What the probe is intended to hybridize to, and in what context.", + "type": "object", + "additionalProperties": false, + "properties": { + "biomolecule": { + "title": "Biomolecule", + "description": "High-level target class.", + "type": "string", + "enum": ["dna","rna","mrna","mirna","lncrna","rrna","genomic_dna","viral_rna","amplicon"] + }, + "context": { + "title": "Context", + "description": "Assay/biological context for the target.", + "type": "string", + "enum": ["genomic","transcript","amplicon","in_situ","capture"] + }, + "target_name": { + "title": "Target Name", + "description": "Gene/transcript/locus identifier (free text).", + "type": "string" + } + } + }, + "set_design": { + "title": "Set / Panel Design", + "description": "Whether the probe is a single oligo or part of a designed set/panel.", + "type": "object", + "additionalProperties": false, + "properties": { + "mode": { + "title": "Set Mode", + "description": "Single probe or specific multi-probe design.", + "type": "string", + "enum": ["single","tiling_set","capture_baits","smfish_panel","merfish_panel","padlock_set"] + }, 
+ "count": { + "title": "Probe Count", + "description": "Number of probes in the set/panel (if known).", + "type": "integer", + "minimum": 1 + } + } + }, + "amplification_mechanism": { + "title": "Amplification Mechanism", + "description": "Signal amplification paradigm, if applicable (complements—does not replace—probe_type).", + "type": "string", + "enum": ["none","hydrolysis","fret","hairpin_turn_on","rolling_circle","branched_dna","hcr"] + }, + "application": { + "title": "Application", + "description": "Intended use(s) of the probe. Provide multiple if applicable.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": ["qpcr","ddpcr","pcr_probe","fish","ish","smfish","merfish","ngs_capture","microarray","southern","northern","dot_blot","in_cell_imaging"], + "description": "Common application keyword." + } + }, + "provenance": { + "title": "Provenance", + "description": "Source metadata for traceability.", + "type": "object", + "additionalProperties": false, + "properties": { + "doi": { "title": "DOI", "description": "Digital Object Identifier of the source article.", "type": "string", "format": "iri", "examples": ["https://doi.org/10.1038/xxxx"] }, + "pmid": { "title": "PMID", "description": "PubMed identifier.", "type": "string", "examples": ["12345678"] }, + "vendor": { "title": "Vendor", "description": "Commercial supplier (if from a catalog).", "type": "string", "examples": ["IDT"] }, + "catalog_number": { "title": "Catalog Number", "description": "Supplier’s catalog identifier.", "type": "string", "examples": ["1001234"] } + } + }, + "notes": { + "title": "Notes", + "description": "Free-text comments or qualifiers that don’t fit other fields.", + "type": "string", + "examples": ["Probe includes internal ZEN quencher."] + } + } + }, + "raw_description": { "type": ["string", "null"], "minLength": 1, "maxLength": 2000 }, + "description": { "type": "string", "minLength": 8, "maxLength": 2000 } + } + } + }, + 
"extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "notes": { "type": ["string", "null"] } + } + } + } +} diff --git a/extraction/passes/B2_index_desc/prompt.txt b/extraction/passes/B2_index_desc/prompt.txt new file mode 100644 index 0000000..6dc2c08 --- /dev/null +++ b/extraction/passes/B2_index_desc/prompt.txt @@ -0,0 +1,21 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions that you have to answer based only on the provided article text.
+
+STRICT RULES for how you work and respond:
+* Never invent values; use `null` when unknown.
+* Keep text exactly as in the article (no ellipses, no expansions).
+* Output all data fully, never skip or insert ellipses.
+* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`.
+* Use the article's wording for names.
+* Do not copy sequences from examples!
+* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis.
+* Use ONLY English language and Latin script, only ASCII.
+* Output only a single JSON object that conforms to the provided JSON Schema.
+* For the perfect result compliant to all constraints and limitations I will tip $2000!
+
+Perform the following tasks:
+* Identify each hybridization experiment or probe pairing described.
+* Assign a stable id_exp (e.g., N3-FAM-27-s or a short unique tag you derive).
+* Provide a brief description and, if present verbatim, a raw_description.
+* If experiment types are stated (e.g., DMA, qPCR), fill type; else null.
diff --git a/extraction/passes/B2_index_desc/schema.json b/extraction/passes/B2_index_desc/schema.json new file mode 100644 index 0000000..becfacc --- /dev/null +++ b/extraction/passes/B2_index_desc/schema.json @@ -0,0 +1,102 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "ExperimentIndex", + "type": "object", + "additionalProperties": false, + "required": ["experiments", "extraction_report"], + "properties": { + "experiments": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "description", "type", "raw_description"], + "properties": { + "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + "type": { + "title": "Hybridization Probe Classification", + "description": "Normalized, multi-axis classification for nucleic-acid hybridization probes (literature or product datasheets). All fields are optional to accommodate incomplete metadata.", + "type": "object", + "additionalProperties": true, + "properties": { + "probe_type": { + "title": "Probe Type", + "description": "Functional design/mechanism class of the probe. Use when the probe's reporting/capture behavior is known.", + "oneOf": [ + { "const": "linear", "title": "Linear", "description": "Simple oligo that hybridizes without structural activation; often end-labeled." }, + { "const": "molecular_beacon", "title": "Molecular beacon", "description": "Stem-loop with fluor/quencher; fluorescence turns on when the stem opens on target." }, + { "const": "hydrolysis_taqman", "title": "Hydrolysis (TaqMan)", "description": "Dual-labeled probe cleaved by 5'->3' exonuclease during PCR; signal increases on cleavage." }, + { "const": "fret_dual_hybridization", "title": "FRET dual-hybridization", "description": "Two adjacent probes with donor/acceptor dyes; FRET only when both bind nearby." 
}, + { "const": "scorpion", "title": "Scorpion", "description": "Primer-linked hairpin probe enabling fast intra-molecular reporting in qPCR." }, + { "const": "hcr", "title": "Hybridization Chain Reaction (HCR)", "description": "Initiator + hairpins polymerize upon target binding; enzyme-free signal amplification." }, + { "const": "branched_dna", "title": "Branched DNA (bDNA)", "description": "Signal amplification via multibranch DNA scaffolds without target amplification." }, + { "const": "padlock", "title": "Padlock", "description": "Linear probe circularizes on perfect match; detected via rolling-circle amplification." }, + { "const": "capture", "title": "Capture", "description": "Affinity-tagged probe used to enrich/pull down targets (e.g., biotinylated capture baits)." }, + { "const": "tiling_set", "title": "Tiling set", "description": "Multiple overlapping probes across a region/gene for robust detection." }, + { "const": "antisense", "title": "Antisense", "description": "Probe designed to bind and block or track transcripts (e.g., ISH tracking)." } + ] + }, + "chemistry": { + "title": "Chemistry Backbone", + "description": "Backbone and chemical modifications (sequence-level chemistry rather than labels/reporters). Primary nucleic-acid scaffold used by the probe.", + "oneOf": [ + { "const": "dna", "title": "DNA", "description": "Unmodified DNA backbone." }, + { "const": "rna", "title": "RNA", "description": "Unmodified RNA backbone." }, + { "const": "cdna", "title": "cDNA", "description": "Complementary DNA derived from RNA." }, + { "const": "pna", "title": "PNA", "description": "Peptide nucleic acid backbone." }, + { "const": "morpholino", "title": "Morpholino", "description": "Morpholine-ring phosphorodiamidate backbone." }, + { "const": "lna_modified", "title": "LNA-modified", "description": "DNA/RNA with Locked Nucleic Acid residues incorporated." }, + { "const": "two_ome_rna", "title": "2′-O-Me RNA", "description": "2′-O-methyl RNA backbone." 
} + ] + }, + "targeting": { + "title": "Targeting", + "description": "What the probe is intended to hybridize to, and in what context.", + "type": "object", + "additionalProperties": false, + "properties": { + "biomolecule": { + "title": "Biomolecule", + "description": "High-level target class.", + "type": "string", + "enum": ["dna","rna","mrna","mirna","lncrna","rrna","genomic_dna","viral_rna","amplicon"] + }, + "context": { + "title": "Context", + "description": "Assay/biological context for the target.", + "type": "string", + "enum": ["genomic","transcript","amplicon","in_situ","capture"] + }, + "target_name": { + "title": "Target Name", + "description": "Gene/transcript/locus identifier (free text).", + "type": "string" + } + } + }, + "notes": { + "title": "Notes", + "description": "Free-text comments or qualifiers that don’t fit other fields.", + "type": "string", + "examples": ["Probe includes internal ZEN quencher."] + } + } + }, + "raw_description": { "type": ["string", "null"], "minLength": 1, "maxLength": 2000 }, + "description": { "type": "string", "minLength": 8, "maxLength": 2000 } + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "notes": { "type": ["string", "null"] } + } + } + } +} diff --git a/extraction/passes/B_index/prompt.txt b/extraction/passes/B_index/prompt.txt new file mode 100644 index 0000000..6dc2c08 --- /dev/null +++ b/extraction/passes/B_index/prompt.txt @@ -0,0 +1,21 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions that you have to answer based only on the provided article text.
+
+STRICT RULES for how you work and respond:
+* Never invent values; use `null` when unknown.
+* Keep text exactly as in the article (no ellipses, no expansions).
+* Output all data fully, never skip or insert ellipses.
+* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`.
+* Use the article's wording for names.
+* Do not copy sequences from examples!
+* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis.
+* Use ONLY English language and Latin script, only ASCII.
+* Output only a single JSON object that conforms to the provided JSON Schema.
+* For the perfect result compliant to all constraints and limitations I will tip $2000!
+
+Perform the following tasks:
+* Identify each hybridization experiment or probe pairing described.
+* Assign a stable id_exp (e.g., N3-FAM-27-s or a short unique tag you derive).
+* Provide a brief description and, if present verbatim, a raw_description.
+* If experiment types are stated (e.g., DMA, qPCR), fill type; else null.
diff --git a/extraction/passes/B_index/schema.json b/extraction/passes/B_index/schema.json new file mode 100644 index 0000000..06501b2 --- /dev/null +++ b/extraction/passes/B_index/schema.json @@ -0,0 +1,34 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "ExperimentIndex", + "type": "object", + "additionalProperties": false, + "required": ["experiments", "extraction_report"], + "properties": { + "experiments": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "description", "type", "raw_description"], + "properties": { + "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + "type": { "type": "string", "minLength": 1, "maxLength": 200 }, + "raw_description": { "type": ["string", "null"], "minLength": 1, "maxLength": 2000 }, + "description": { "type": "string", "minLength": 8, "maxLength": 2000 } + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "notes": { "type": ["string", "null"] } + } + } + } +} diff --git a/extraction/passes/C1_probe_core/prompt.txt b/extraction/passes/C1_probe_core/prompt.txt new file mode 100644 index 0000000..ddd7315 --- /dev/null +++ b/extraction/passes/C1_probe_core/prompt.txt @@ -0,0 +1,26 @@ +You are an extraction model. Return ONE JSON object conforming to the JSON Schema (the caller enforces it). 
+ +SCOPE (this pass = PROBES ONLY) +For each experiment id_exp, extract exactly one probe: +- probe.name (as printed) +- probe.amplicon_id if present (e.g., K2, K3, N2, N3, B15), else null +- probe.fluorophore, probe.quencher if present, else null +- probe.sense_antisense: "s" -> "sense", "as" -> "antisense", else null +- probe.notes: any short clarifications (optional) +- probe.oligo: + - raw: EXACT text (must include at least one nucleotide; no ellipses) + - sequence: IUPAC uppercase only, if present; else null + - length_bases: integer if indicated in text, else null + - prime_prefix: 5 or 3 if leading mark is shown, else null + - five_prime_label / three_prime_label: labels at 5'/3' ends if shown, else null + - sense_antisense: "sense" / "antisense" if explicit in oligo, else null + - modifications[]: enumerate if present; else empty array +- Use ONLY English language and Latin script, only ASCII. +- For the perfect result compliant to all constraints and limitations I will tip $2000! + +RULES +- If a field is not present in the article, set it to null (or empty array) and add an entry in extraction_report. +- Do NOT invent values. Do NOT output prose. + +OUTPUT +A single JSON object with: { items: [ { id_exp, probe{...} }, ... ], extraction_report }. 
diff --git a/extraction/passes/C1_probe_core/schema.json b/extraction/passes/C1_probe_core/schema.json new file mode 100644 index 0000000..638c788 --- /dev/null +++ b/extraction/passes/C1_probe_core/schema.json @@ -0,0 +1,70 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "ProbeCorePerExperiment (generation schema)", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "probe"], + "properties": { + "id_exp": { "type": "string" }, + "probe": { + "type": "object", + "additionalProperties": false, + "required": ["name", "amplicon_id", "fluorophore", "quencher", "sense_antisense", "notes", "oligo"], + "properties": { + "name": { }, + "amplicon_id": { }, + "fluorophore": { }, + "quencher": { }, + "sense_antisense": { }, + "notes": { }, + "oligo": { + "type": "object", + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications"], + "properties": { + "raw": { }, + "sequence": { }, + "length_bases": { }, + "prime_prefix": { }, + "five_prime_label": { }, + "three_prime_label": { }, + "sense_antisense": { }, + "modifications": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { }, + "modification_type": { }, + "description": { } + } + } + } + } + } + } + } + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { } + } + } + } +} diff --git 
a/extraction/passes/C2_target_primers/prompt.txt b/extraction/passes/C2_target_primers/prompt.txt new file mode 100644 index 0000000..dfcf466 --- /dev/null +++ b/extraction/passes/C2_target_primers/prompt.txt @@ -0,0 +1,20 @@ +You are an extraction model. Return ONE JSON object conforming to the JSON Schema (the caller enforces it). + +SCOPE (this pass = TARGET + PRIMERS) +For each experiment id_exp: +- target_sequence: same oligo decomposition fields as probes. If no explicit target oligo is printed, set to null. +- primer_sequences: object with forward and reverse oligos, each decomposed like probes (raw, sequence, labels, modifications...). If primers are not listed, set primer_sequences to null. + +IMPORTANT RULES +- oligo.raw is copied EXACTLY and must contain at least one nucleotide letter. No ellipses. +- sequence must be IUPAC uppercase: A C G U/T R Y S W K M B D H V N (no spaces/punct.). +- prime_prefix 5/3 only if explicitly shown; otherwise null. +- five_prime_label / three_prime_label if present; otherwise null. +- fluorophore / quencher usually null for primers, but set if printed. +- modifications[] empty when absent. +- If any field is not present, set to null and record in extraction_report; do not guess. +- Use ONLY English language and Latin script, only ASCII. +- For the perfect result compliant to all constraints and limitations I will tip $2000! 
+ +OUTPUT +A single JSON object with: { items: [ { id_exp, target_sequence, primer_sequences }, … ], extraction_report } diff --git a/extraction/passes/C2_target_primers/schema.json b/extraction/passes/C2_target_primers/schema.json new file mode 100644 index 0000000..d4b82f8 --- /dev/null +++ b/extraction/passes/C2_target_primers/schema.json @@ -0,0 +1,127 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "TargetAndPrimersPerExperiment (generation schema)", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "target_sequence", "primer_sequences"], + "properties": { + "id_exp": { "type": "string" }, + + "target_sequence": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], + "properties": { + "raw": { }, + "sequence": { }, + "length_bases": { }, + "prime_prefix": { }, + "five_prime_label": { }, + "three_prime_label": { }, + "sense_antisense": { }, + "modifications": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { }, + "modification_type": { }, + "description": { } + } + } + }, + "fluorophore": { }, + "quencher": { } + } + }, + + "primer_sequences": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["forward", "reverse"], + "properties": { + "forward": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], + 
"properties": { + "raw": { }, + "sequence": { }, + "length_bases": { }, + "prime_prefix": { }, + "five_prime_label": { }, + "three_prime_label": { }, + "sense_antisense": { }, + "modifications": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { }, + "modification_type": { }, + "description": { } + } + } + }, + "fluorophore": { }, + "quencher": { } + } + }, + "reverse": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], + "properties": { + "raw": { }, + "sequence": { }, + "length_bases": { }, + "prime_prefix": { }, + "five_prime_label": { }, + "three_prime_label": { }, + "sense_antisense": { }, + "modifications": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { }, + "modification_type": { }, + "description": { } + } + } + }, + "fluorophore": { }, + "quencher": { } + } + } + } + } + } + } + }, + + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { } + } + } + } +} diff --git a/extraction/passes/C3_related/prompt.txt b/extraction/passes/C3_related/prompt.txt new file mode 100644 index 0000000..5105640 --- /dev/null +++ b/extraction/passes/C3_related/prompt.txt @@ -0,0 +1,19 @@ +You are an extraction model. Return ONE JSON object conforming to the JSON Schema (the caller enforces it). 
+ +SCOPE (this pass = RELATED SEQUENCES) +For each experiment id_exp, extract zero or more related sequences: +- related_sequences[]: each item has related_sequence (oligo decomposition like probes) and optional description. + +RULES +- oligo.raw copied EXACTLY; must include ≥1 nucleotide letter; no ellipses. +- sequence = IUPAC uppercase only (no spaces/punct.), else null. +- prime_prefix = 5 or 3 if shown, else null. +- five_prime_label / three_prime_label if printed, else null. +- fluorophore / quencher if printed, else null. +- modifications[] empty if absent. +- If not provided in the article, use an empty array. Do NOT invent sequences. +- Use ONLY English language and Latin script, only ASCII. +- For the perfect result compliant to all constraints and limitations I will tip $2000! + +OUTPUT +A single JSON object with: { items: [ { id_exp, related_sequences[] }, … ], extraction_report } diff --git a/extraction/passes/C3_related/schema.json b/extraction/passes/C3_related/schema.json new file mode 100644 index 0000000..7cb7016 --- /dev/null +++ b/extraction/passes/C3_related/schema.json @@ -0,0 +1,70 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "RelatedSequencesPerExperiment (generation schema)", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "related_sequences"], + "properties": { + "id_exp": { "type": "string" }, + "related_sequences": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["related_sequence", "description"], + "properties": { + "related_sequence": { + "type": "object", + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], + 
"properties": { + "raw": { }, + "sequence": { }, + "length_bases": { }, + "prime_prefix": { }, + "five_prime_label": { }, + "three_prime_label": { }, + "sense_antisense": { }, + "modifications": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { }, + "modification_type": { }, + "description": { } + } + } + }, + "fluorophore": { }, + "quencher": { } + } + }, + "description": { } + } + } + } + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { } + } + } + } +} diff --git a/extraction/passes/C4_probe_target/prompt.txt b/extraction/passes/C4_probe_target/prompt.txt new file mode 100644 index 0000000..5105640 --- /dev/null +++ b/extraction/passes/C4_probe_target/prompt.txt @@ -0,0 +1,19 @@ +You are an extraction model. Return ONE JSON object conforming to the JSON Schema (the caller enforces it). + +SCOPE (this pass = RELATED SEQUENCES) +For each experiment id_exp, extract zero or more related sequences: +- related_sequences[]: each item has related_sequence (oligo decomposition like probes) and optional description. + +RULES +- oligo.raw copied EXACTLY; must include ≥1 nucleotide letter; no ellipses. +- sequence = IUPAC uppercase only (no spaces/punct.), else null. +- prime_prefix = 5 or 3 if shown, else null. +- five_prime_label / three_prime_label if printed, else null. +- fluorophore / quencher if printed, else null. +- modifications[] empty if absent. +- If not provided in the article, use an empty array. Do NOT invent sequences. +- Use ONLY English language and Latin script, only ASCII. 
+- For the perfect result compliant to all constraints and limitations I will tip $2000! + +OUTPUT +A single JSON object with: { items: [ { id_exp, related_sequences[] }, … ], extraction_report } diff --git a/extraction/passes/C4_probe_target/schema.json b/extraction/passes/C4_probe_target/schema.json new file mode 100644 index 0000000..a2256c4 --- /dev/null +++ b/extraction/passes/C4_probe_target/schema.json @@ -0,0 +1,174 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "SequencesPerExperiment (generation schema)", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "probe", "target_sequence", "primer_sequences", "related_sequences"], + "properties": { + "id_exp": { "type": "string" }, + + "probe": { + "type": "object", + "additionalProperties": false, + "required": ["name", "amplicon_id", "fluorophore", "quencher", "sense_antisense", "notes", "oligo"], + "properties": { + "name": { "type": "string", "maxLength": 500 }, + "amplicon_id": { "type" :["string", "null"], "maxLength": 100 }, + "fluorophore": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "quencher": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + "notes": { "type" :["string", "null"], "maxLength": 100 }, + "oligo": { + "type": "object", + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": "number" }, + "prime_prefix": { "type" :["string", 
"null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "five_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "three_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { "type": "number" }, + "modification_type": { "type" :"string", "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "description": { "type": "string", "maxLength": 100} + } + } + } + } + } + } + }, + + "target_sequence": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": "number" }, + "prime_prefix": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "five_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "three_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { "type": "number" }, + "modification_type": { "type" :"string", "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "description": { "type": "string", "maxLength": 100} + } + }, 
+ "minItems": 0, + "maxItems": 100 + }, + "fluorophore": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "quencher": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" } + } + }, + + "primer_sequences": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["forward", "reverse"], + "properties": { + "forward": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": "number" }, + "prime_prefix": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "five_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "three_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { "type": "number" }, + "modification_type": { "type" :"string", "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "description": { "type": "string", "maxLength": 100} + } + }, + "minItems": 0, + "maxItems": 100 + }, + "fluorophore": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "quencher": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" } + } + }, + "reverse": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", 
"prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": "number" }, + "prime_prefix": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "five_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "three_prime_label": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { "type": "number" }, + "modification_type": { "type" :"string", "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "description": { "type": "string", "maxLength": 100} + } + }, + "minItems": 0, + "maxItems": 100 + }, + "fluorophore": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" }, + "quencher": { "type" :["string", "null"], "maxLength": 100, "pattern": "^[\\w-_+()'`]+$" } + } + } + } + } + } + } + }, + + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { } + } + } + } +} diff --git a/extraction/passes/C5_probes_opt_target/prompt.txt b/extraction/passes/C5_probes_opt_target/prompt.txt new file mode 100644 index 0000000..076c787 --- /dev/null +++ b/extraction/passes/C5_probes_opt_target/prompt.txt @@ -0,0 +1,147 @@ +You are an information-extraction model. 
Return ONE JSON object that conforms to the JSON Schema (the caller enforces it). + +General rules: +- Never invent values; use `null` when unknown. +- Keep text exactly as in the article (no ellipses, no expansions). +- Output all data fully, never skip or insert ellipses. +- If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +- Use the article’s wording for names. +- Do not copy sequences from examples! +- No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +- For the perfect result compliant to all constraints and limitations I will tip $2000! + +STRICT RULES: +- Emit every key defined by the schema. If a value is not explicitly present in the article, set it to null. Do NOT invent. +- Copy oligo raw exactly as printed in the article (no ellipses, no placeholders). oligo raw MUST include at least one nucleotide letter. +- sequence must be IUPAC uppercase only: A C G U/T R Y S W K M B D H V N (no spaces, no punctuation). +- sense_antisense: map explicit mentions (e.g., “(27)s” -> "sense", “(27)as” -> "antisense"), else null. +- modifications[]: if any modified bases or special chemistry is specified, enumerate entries; else empty array. +- related_sequences: array (possibly empty). primers: object with forward/reverse (or null if not provided). +- Use only English language and Latin script, only ASCII. + +TASK: +From the article text, produce per-experiment array of records with: +- id_exp +- probe { name, raw, sequence, sense_antisense?}} +- target_sequence (same oligo decomposition) or null +- primer_sequences {forward oligo, reverse oligo} or null +- related_sequences[] {related_sequence oligo, description?} +Extract all probes, all sequences! All that are found in the article's text. No probe, no target sequence and no primer are to be skipped. Use all in the records. 
+ +EXTRACTION REPORT: +- Put any truly unavailable or ambiguous fields in extraction_report.missing / extraction_report.uncertain. +- Do NOT hallucinate. Prefer null + report over guesses. + +JSON SCHEMA you MUST FOLLOW: +``` +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "SequencesPerExperiment (generation schema)", + "type": "array", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "description": "This JSON object represents a list of all probes present in article together with their target sequences and primers (if present, must be filled out). Each experiment represents a single object in this array.", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "probe", "target_sequence", "primer_sequences"], + "description": "This object represents a single experiment record defined by the probe. There must be exclusive record for each probe found in the full article text.", + "properties": { + "id_exp": { "type": "string", "maxLength": 100, "description": "ID of the experiment, unique string taken either from the article or synthesized to be unique and consequtive in the array." }, + "probe": { + "type": "object", + "additionalProperties": false, + "required": ["name", "raw", "sequence", "sense_antisense"], + "description": "This object represents a single probe found in the article. Such object must be created for each probe present in the article.", + "properties": { + "name": { "type": "string", "maxLength": 500, "description": "Name of the probe as provided in the article or description of the probe created by you." }, + "raw": { "type": "string", "maxLength": 500, "description": "Direct text quote from the article describing current single probe." }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "IUPAC sequence of the probe in the same order as given in the article." 
}, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the sequence of this probe provided in the article was sense or antisense. Set to null if and only if strand of this probe can't be inferred from the article text." } + } + }, + + "target_sequence": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "description": "This object describes the target sequence for which the probe from the current experiment record was constructed. Omit by providing null if and only if article does not provide the target sequence explicitly.", + "properties": { + "raw": { "type": "string", "maxLength": 500, "description": "Target sequence for which the probe from the current experiment was constructed. Provide a name or other mention exactly how it is specified in article." }, + "sequence": { "type": ["string", "null"], "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Target sequence in IUPAC format. Must be specified if present in article for the current probe of the current experiment. Omit by providing null if and only if article does not specify target sequence and only provides its name or otherwise non-IUPAC definition." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the provided target sequence of this probe was sense or antisense. Set to null if and only if strand of this probe can't be inferred from the article text." } + } + }, + + "primer_sequences": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["forward", "reverse"], + "description": "This object holds information about the primers used for the current experiment record for the probe. 
Omit by providing null here if and only if primers are not specified for the current probe in the article text.", + "properties": { + "forward": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "description": "Forward primer in this experiment. Omit by providing null here if and only if forward is not specified for the current probe in the article text.", + "properties": { + "raw": { "type": "string", "maxLength": 500, "description": "Forward primer sequence used for the probe from the current experiment. Provide a name or other mention exactly how it is specified in article." }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Forward primer sequence in IUPAC format for the current probe in the current experiment. Must be specified if present in article for the current probe of the current experiment. Omit by providing null if and only if article does not specify forward primer sequence and only provides its name or otherwise non-IUPAC definition." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the provided forward primer sequence of this probe was sense or antisense. Set to null if and only if strand of this forward primer can't be inferred from the article text." } + } + }, + "reverse": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "description": "Reverse primer in this experiment. Omit by providing null here if and only if reverse is not specified for the current probe in the article text.", + "properties": { + "raw": { "type": "string", "maxLength": 500, "description": "Reverse primer sequence used for the probe from the current experiment. Provide a name or other mention exactly how it is specified in article." 
}, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Reverse primer sequence in IUPAC format for the current probe in the current experiment. Must be specified if present in article for the current probe of the current experiment. Omit by providing null if and only if article does not specify reverse primer sequence and only provides its name or otherwise non-IUPAC definition." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the provided reverse primer sequence of this probe was sense or antisense. Set to null if and only if strand of this reverse primer can't be inferred from the article text." } + } + } + } + }, + + "related_sequences": { + "type": "array", + "description": "Array containing any other related sequences relating to the probe, target or primers in the current experiment if present in article. Omit by providing an empty array if and only if article does not specify any related sequences for the current experiment.", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["related_sequence", "description"], + "properties": { + "related_sequence": { + "type": "object", + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "description": "Related sequence record for the current experiment.", + "properties": { + "raw": { "type": "string", "maxLength": 500, "description": "Related sequence used for the probe from the current experiment. Provide a name or other mention exactly how it is specified in article." }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Related sequence in IUPAC format for the current probe in the current experiment. Must be specified if present in article for the current probe of the current experiment. 
Omit by providing null if and only if article does not specify related sequence and only provides its name or otherwise non-IUPAC definition." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the provided related sequence of this probe was sense or antisense. Set to null if and only if strand of this related can't be inferred from the article text." } + } + }, + "description": { "type": "string", "maxLength": 500, "description": "Name and/or description of this related sequence, explaining its relation to the current experiment."} + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "description": "Information about technical specifics of the current experiment extraction.", + "properties": { + "missing": { "type": "array", "items": { "type": "string", "maxLength": 500 }, "description": "Array describing missing elements which where set to null or omitted. Each record must be a JSON path to corresponding field." }, + "uncertain": { "type": "array", "items": { "type": "string", "maxLength": 500 }, "description": "Array describing uncertain elements which article mentions vaguely or controversely. Each record must be a JSON path to corresponding field." }, + "notes": { "type": "string", "description": "Any other notes related to the extraction of this experiment", "maxLength": 500 } + } + } + } + } +} +``` + +OUTPUT: +Return exactly one JSON object that conforms to the schema. No prose. 
diff --git a/extraction/passes/C5_probes_opt_target/schema.json b/extraction/passes/C5_probes_opt_target/schema.json new file mode 100644 index 0000000..e54f80b --- /dev/null +++ b/extraction/passes/C5_probes_opt_target/schema.json @@ -0,0 +1,107 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "SequencesPerExperiment (generation schema)", + "type": "array", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "description": "This JSON object represents a list of all probes present in article together with their target sequences and primers (if present, must be filled out). Each experiment represents a single object in this array.", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "probe", "target_sequence", "primer_sequences"], + "description": "This object represents a single experiment record defined by the probe. There must be exclusive record for each probe found in the full article text.", + "properties": { + "id_exp": { "type": "string", "maxLength": 100, "description": "ID of the experiment, unique string taken either from the article or synthesized to be unique and consequtive in the array." }, + "probe": { + "type": "object", + "additionalProperties": false, + "required": ["name", "raw", "sequence", "sense_antisense"], + "description": "This object represents a single probe found in the article. Such object must be created for each probe present in the article.", + "properties": { + "name": { "type": "string", "maxLength": 500, "description": "Name of the probe as provided in the article or description of the probe created by you." }, + "raw": { "type": "string", "maxLength": 500, "description": "Direct text quote from the article describing current single probe." }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "IUPAC sequence of the probe in the same order as given in the article." 
}, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the sequence of this probe provided in the article was sense or antisense. Set to null if and only if strand of this probe can't be inferred from the article text." } + } + }, + + "target_sequence": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "description": "This object describes the target sequence for which the probe from the current experiment record was constructed. Omit by providing null if and only if article does not provide the target sequence explicitly.", + "properties": { + "raw": { "type": "string", "maxLength": 500, "description": "Target sequence for which the probe from the current experiment was constructed. Provide a name or other mention exactly how it is specified in article." }, + "sequence": { "type": ["string", "null"], "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Target sequence in IUPAC format. Must be specified if present in article for the current probe of the current experiment. Omit by providing null if and only if article does not specify target sequence and only provides its name or otherwise non-IUPAC definition." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the provided target sequence of this probe was sense or antisense. Set to null if and only if strand of this probe can't be inferred from the article text." } + } + }, + + "primer_sequences": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["forward", "reverse"], + "description": "This object holds information about the primers used for the current experiment record for the probe. 
Omit by providing null here if and only if primers are not specified for the current probe in the article text.", + "properties": { + "forward": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "description": "Forward primer in this experiment. Omit by providing null here if and only if forward is not specified for the current probe in the article text.", + "properties": { + "raw": { "type": "string", "maxLength": 500, "description": "Forward primer sequence used for the probe from the current experiment. Provide a name or other mention exactly how it is specified in article." }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Forward primer sequence in IUPAC format for the current probe in the current experiment. Must be specified if present in article for the current probe of the current experiment. Omit by providing null if and only if article does not specify forward primer sequence and only provides its name or otherwise non-IUPAC definition." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the provided forward primer sequence of this probe was sense or antisense. Set to null if and only if strand of this forward primer can't be inferred from the article text." } + } + }, + "reverse": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "description": "Reverse primer in this experiment. Omit by providing null here if and only if reverse is not specified for the current probe in the article text.", + "properties": { + "raw": { "type": "string", "maxLength": 500, "description": "Reverse primer sequence used for the probe from the current experiment. Provide a name or other mention exactly how it is specified in article." 
}, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Reverse primer sequence in IUPAC format for the current probe in the current experiment. Must be specified if present in article for the current probe of the current experiment. Omit by providing null if and only if article does not specify reverse primer sequence and only provides its name or otherwise non-IUPAC definition." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the provided reverse primer sequence of this probe was sense or antisense. Set to null if and only if strand of this reverse primer can't be inferred from the article text." } + } + } + } + }, + + "related_sequences": { + "type": "array", + "description": "Array containing any other related sequences relating to the probe, target or primers in the current experiment if present in article. Omit by providing an empty array if and only if article does not specify any related sequences for the current experiment.", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["related_sequence", "description"], + "properties": { + "related_sequence": { + "type": "object", + "additionalProperties": false, + "required": ["raw", "sequence", "sense_antisense"], + "description": "Related sequence record for the current experiment.", + "properties": { + "raw": { "type": "string", "maxLength": 500, "description": "Related sequence used for the probe from the current experiment. Provide a name or other mention exactly how it is specified in article." }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Related sequence in IUPAC format for the current probe in the current experiment. Must be specified if present in article for the current probe of the current experiment. 
Omit by providing null if and only if article does not specify related sequence and only provides its name or otherwise non-IUPAC definition." }, + "sense_antisense": { "enum": ["sense", "antisense", null], "description": "Whether the provided related sequence of this probe was sense or antisense. Set to null if and only if strand of this related can't be inferred from the article text." } + } + }, + "description": { "type": "string", "maxLength": 500, "description": "Name and/or description of this related sequence, explaining its relation to the current experiment."} + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "description": "Information about technical specifics of the current experiment extraction.", + "properties": { + "missing": { "type": "array", "items": { "type": "string", "maxLength": 500 }, "description": "Array describing missing elements which where set to null or omitted. Each record must be a JSON path to corresponding field." }, + "uncertain": { "type": "array", "items": { "type": "string", "maxLength": 500 }, "description": "Array describing uncertain elements which article mentions vaguely or controversely. Each record must be a JSON path to corresponding field." }, + "notes": { "type": "string", "description": "Any other notes related to the extraction of this experiment", "maxLength": 500 } + } + } + } + } +} diff --git a/extraction/passes/C_sequences/prompt.txt b/extraction/passes/C_sequences/prompt.txt new file mode 100644 index 0000000..4d8b399 --- /dev/null +++ b/extraction/passes/C_sequences/prompt.txt @@ -0,0 +1,48 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following tasks: +* For each `id_exp`, extract probe (name and the full oligo string exactly as printed in the article text), and include optional `target_sequence`, `primer_sequences`, and `related_sequences` when present, otherwise set them to `null`. +* The `oligo_lite.raw` must contain nucleotides and no ellipses. +* Keep labels like FAM/ROX/BHQ2 in the text; if article does not mention them explicitly, leave derived fields `null`. + +You are an extraction model. Return ONE JSON object that conforms to the JSON Schema (the caller enforces it). + +STRICT RULES +- Emit every key defined by the schema. If a value is not explicitly present in the article, set it to null. Do NOT invent. +- Copy oligo.raw exactly as printed in the article (no ellipses, no placeholders). oligo.raw MUST include at least one nucleotide letter. +- sequence must be IUPAC uppercase only: A C G U/T R Y S W K M B D H V N (no spaces, no punctuation). +- prime_prefix is 5 or 3 when the prefix like “5′-” or “3′-” is present, else null. 
+- five_prime_label / three_prime_label: labels at 5′/3′ ends (e.g., FAM, ROX, BHQ1, BHQ2, RTQ1), else null. +- fluorophore / quencher: extract if present, else null. +- sense_antisense: map explicit mentions (e.g., “(27)s” -> "sense", “(27)as” -> "antisense"), else null. +- modifications[]: if any modified bases or special chemistry is specified, enumerate entries; else empty array. +- related_sequences: array (possibly empty). primers: object with forward/reverse (or null if not provided). + +TASK +From the article text, produce per-experiment items with: +- id_exp +- probe { name, amplicon_id?, fluorophore?, quencher?, sense_antisense?, notes?, oligo{raw, sequence?, length_bases?, prime_prefix?, five_prime_label?, three_prime_label?, sense_antisense?, modifications[]}} +- target_sequence (same oligo decomposition) or null +- primer_sequences {forward oligo, reverse oligo} or null +- related_sequences[] {related_sequence oligo, description?} + +EXTRACTION REPORT +- Put any truly unavailable or ambiguous fields in extraction_report.missing / extraction_report.uncertain. +- Do NOT hallucinate. Prefer null + report over guesses. + +OUTPUT +Return exactly one JSON object that conforms to the schema. No prose. 
diff --git a/extraction/passes/C_sequences/schema.json b/extraction/passes/C_sequences/schema.json new file mode 100644 index 0000000..f0f5020 --- /dev/null +++ b/extraction/passes/C_sequences/schema.json @@ -0,0 +1,217 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "SequencesPerExperiment (generation schema)", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "probe", "target_sequence", "primer_sequences", "related_sequences"], + "properties": { + "id_exp": { "type": "string" }, + + "probe": { + "type": "object", + "additionalProperties": false, + "required": ["name", "amplicon_id", "fluorophore", "quencher", "sense_antisense", "notes", "oligo"], + "properties": { + "name": { "type": "string", "maxLength": 500 }, + "amplicon_id": { "type": ["string", "null"], "maxLength": 100 }, + "fluorophore": { "type": ["string", "null"], "maxLength": 100 }, + "quencher": { "type": ["string", "null"], "maxLength": 100 }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + "notes": { "type": ["string", "null"], "maxLength": 100 }, + + "oligo": { + "type": "object", + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": "string", "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "enum": [3, 5, null] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + 
"modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 100 }, + "description": { "type": ["string", "null"], "maxLength": 100 } + } + } + } + } + } + } + }, + + "target_sequence": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": ["string", "null"], "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$", "description": "Provide IUPAC sequence for the target of this probe, if it's present in article. Otherwise put null here and just put name and description into the raw field." 
}, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "enum": [3, 5, null] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 100 }, + "description": { "type": ["string", "null"], "maxLength": 100 } + } + } + }, + "fluorophore": { "type": ["string", "null"], "maxLength": 100 }, + "quencher": { "type": ["string", "null"], "maxLength": 100 } + } + }, + + "primer_sequences": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["forward", "reverse"], + "properties": { + "forward": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": ["string", "null"], "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "enum": [3, 5, null] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + 
"properties": { + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 100 }, + "description": { "type": ["string", "null"], "maxLength": 100 } + } + } + }, + "fluorophore": { "type": ["string", "null"], "maxLength": 100 }, + "quencher": { "type": ["string", "null"], "maxLength": 100 } + } + }, + "reverse": { + "type": ["object", "null"], + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": ["string", "null"], "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "enum": [3, 5, null] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 100 }, + "description": { "type": ["string", "null"], "maxLength": 100 } + } + } + }, + "fluorophore": { "type": ["string", "null"], "maxLength": 100 }, + "quencher": { "type": ["string", "null"], "maxLength": 100 } + } + } + } + }, + + "related_sequences": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["related_sequence", "description"], + "properties": { + "related_sequence": { + "type": "object", + "additionalProperties": false, + 
"required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "sense_antisense", "modifications", "fluorophore", "quencher"], + "properties": { + "raw": { "type": "string", "maxLength": 500 }, + "sequence": { "type": ["string", "null"], "minLength": 5, "maxLength": 500, "pattern": "^[ACGUTRYSWKMBDHVN]+$" }, + "length_bases": { "type": ["integer", "null"], "minimum": 1 }, + "prime_prefix": { "enum": [3, 5, null] }, + "five_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "three_prime_label": { "type": ["string", "null"], "maxLength": 100 }, + "sense_antisense": { "enum": ["sense", "antisense", null] }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["modification_position", "modification_type", "description"], + "properties": { + "modification_position": { "type": ["integer", "null"], "minimum": 1 }, + "modification_type": { "type": ["string", "null"], "maxLength": 100 }, + "description": { "type": ["string", "null"], "maxLength": 100 } + } + } + }, + "fluorophore": { "type": ["string", "null"], "maxLength": 100 }, + "quencher": { "type": ["string", "null"], "maxLength": 100 } + } + }, + "description": { "type": ["string", "null"], "maxLength": 200 } + } + } + } + } + } + }, + + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "uncertain": { "type": "array", "items": { "type": "string" }, "minItems": 0 }, + "notes": { "type": ["string", "null"] } + } + } + } +} diff --git a/extraction/passes/D_parameters/prompt.txt b/extraction/passes/D_parameters/prompt.txt new file mode 100644 index 0000000..4f152e9 --- /dev/null +++ b/extraction/passes/D_parameters/prompt.txt @@ -0,0 +1,20 @@ +You are an information-extraction model. 
+ +You will be given a full text of the article between the tags
and
and a series of questions that you have to answer based only on the provided article text. + +STRICT RULES of how you work and respond: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following tasks: +* For each `id_exp`, extract `metadata` and `experiment_properties`. +* Use `measurement_lite` for numeric items: keep raw text and parsed value+unit when clear; otherwise leave numeric fields `null`. +* If not present in the article, use `null` and record the pointer. 
diff --git a/extraction/passes/D_parameters/schema.json b/extraction/passes/D_parameters/schema.json new file mode 100644 index 0000000..08511ad --- /dev/null +++ b/extraction/passes/D_parameters/schema.json @@ -0,0 +1,109 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "ParametersPerExperiment", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "metadata", "experiment_properties"], + "properties": { + "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + "metadata": { + "type": "object", + "additionalProperties": false, + "required": ["organism", "technology", "annealing", "pH", "rna_impurities"], + "properties": { + "organism": { "type": ["string", "null"], "maxLength": 200 }, + "technology": { "type": ["string", "null"], "maxLength": 200 }, + "annealing": { + "oneOf": [ + { + "type": "object", + "additionalProperties": false, + "required": ["quantitative", "qualitative"], + "properties": { + "quantitative": { "$ref": "#/$defs/measurement_lite" }, + "qualitative": { "type": ["boolean", "null"] } + } + }, + { "type": "null" } + ] + }, + "pH": { "$ref": "#/$defs/measurement_lite" }, + "rna_impurities": { + "oneOf": [ + { + "type": "object", + "additionalProperties": false, + "required": ["quantitative", "qualitative"], + "properties": { + "quantitative": { "$ref": "#/$defs/measurement_lite" }, + "qualitative": { "type": ["boolean", "null"] } + } + }, + { "type": "null" } + ] + } + } + }, + "experiment_properties": { + "type": "object", + "additionalProperties": false, + "required": ["concentrations", "parameters_SI"], + "properties": { + "concentrations": { + "type": "object", + "additionalProperties": false, + "properties": { + "dna_rna_concentration": { "$ref": "#/$defs/measurement_lite" }, + "concentration_SI": { 
"$ref": "#/$defs/measurement_lite" } + } + }, + "parameters_SI": { + "type": "object", + "additionalProperties": false, + "required": ["temperature", "Tris", "Na", "K", "Mg", "DMSO"], + "properties": { + "temperature": { "$ref": "#/$defs/measurement_lite" }, + "Tris": { "$ref": "#/$defs/measurement_lite" }, + "Na": { "$ref": "#/$defs/measurement_lite" }, + "K": { "$ref": "#/$defs/measurement_lite" }, + "Mg": { "$ref": "#/$defs/measurement_lite" }, + "DMSO": { "$ref": "#/$defs/measurement_lite" } + } + } + } + } + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { "type": ["string", "null"] } + } + } + }, + "$defs": { + "measurement_lite": { + "type": "object", + "additionalProperties": false, + "required": ["raw", "value", "unit"], + "properties": { + "raw": { "type": "string", "minLength": 1, "maxLength": 200, "description": "Textual value representation." }, + "value": { "type": ["number", "null"], "description": "Numeric value representation." }, + "unit": { "type": ["string", "null"], "maxLength": 50, "description": "Measurement unit for the numeric value representation." } + } + } + } +} diff --git a/extraction/passes/E_outcomes/prompt.txt b/extraction/passes/E_outcomes/prompt.txt new file mode 100644 index 0000000..c45dee6 --- /dev/null +++ b/extraction/passes/E_outcomes/prompt.txt @@ -0,0 +1,18 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions that you have to answer based only on the provided article text. + +STRICT RULES of how you work and respond: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following tasks: +* For each `id_exp`, extract outcome (boolean if explicitly stated, otherwise `null`), `fluorescence` as `measurement_lite`, and any `comparative_notes`. 
diff --git a/extraction/passes/E_outcomes/schema.json b/extraction/passes/E_outcomes/schema.json new file mode 100644 index 0000000..c15da38 --- /dev/null +++ b/extraction/passes/E_outcomes/schema.json @@ -0,0 +1,46 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "OutcomesPerExperiment", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "outcome", "fluorescence", "comparative_notes"], + "properties": { + "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + "outcome": { "type": ["boolean", "null"] }, + "fluorescence": { "$ref": "#/$defs/measurement_lite" }, + "comparative_notes": { "type": ["string", "null"], "maxLength": 500 } + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { "type": ["string", "null"] } + } + } + }, + "$defs": { + "measurement_lite": { + "type": "object", + "additionalProperties": false, + "required": ["raw", "value", "unit"], + "properties": { + "raw": { "type": "string", "minLength": 1, "maxLength": 200 }, + "value": { "type": ["number", "null"] }, + "unit": { "type": ["string", "null"], "maxLength": 50 } + } + } + } +} diff --git a/extraction/passes/F_pairings/prompt.txt b/extraction/passes/F_pairings/prompt.txt new file mode 100644 index 0000000..fa79127 --- /dev/null +++ b/extraction/passes/F_pairings/prompt.txt @@ -0,0 +1,18 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions that you have to answer based only on the provided article text. + +STRICT RULES of how you work and respond: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following tasks: +* For each `id_exp`, extract references to paired probes and relationship (e.g., "same sequence different labels", "reciprocal"). 
diff --git a/extraction/passes/F_pairings/schema.json b/extraction/passes/F_pairings/schema.json new file mode 100644 index 0000000..ce46748 --- /dev/null +++ b/extraction/passes/F_pairings/schema.json @@ -0,0 +1,33 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "PairingsPerExperiment", + "type": "object", + "additionalProperties": false, + "required": ["items", "extraction_report"], + "properties": { + "items": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "paired_with_probe_name", "relationship"], + "properties": { + "id_exp": { "type": "string", "minLength": 1, "maxLength": 200 }, + "paired_with_probe_name": { "type": ["string", "null"], "maxLength": 200 }, + "relationship": { "type": ["string", "null"], "maxLength": 200 } + } + } + }, + "extraction_report": { + "type": "object", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { "type": "array", "items": { "type": "string" } }, + "uncertain": { "type": "array", "items": { "type": "string" } }, + "notes": { "type": ["string", "null"] } + } + } + } +} diff --git a/extraction/passes/_1_SeqPrompt/prompt.txt b/extraction/passes/_1_SeqPrompt/prompt.txt new file mode 100644 index 0000000..b17ac4d --- /dev/null +++ b/extraction/passes/_1_SeqPrompt/prompt.txt @@ -0,0 +1,38 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions that you have to answer based only on the provided article text. + +STRICT RULES of how you work and respond: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following task: +* Extract all the DNA or RNA sequences provided in this article and provide them in a JSON format. +* Be sure to extract real nucleotidic sequences in IUPAC form (with or without modification). +* But be careful and cautious and remove letter sequences unrelated to the nucleotide sequences. Articles may have other letter sequences such as abbreviations. You are only interested in nucleotidic sequences, such as the probe sequences, primers, target sequences etc. + +Here is the JSON schema you have to follow: +```json +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "AllSequences", + "description": "All DNA, RNA and other sequences present in article", + "type": "array", + "minItems": 1, + "maxItems": 1000, + "items": { + "type": "string", + "minLength": 5, + "maxLength": 150, + "description": "A single sequence out of all the DNA, RNA and other sequences from the article." 
+ } +} +``` \ No newline at end of file diff --git a/extraction/passes/_1_SeqPrompt/prompt_strict.txt b/extraction/passes/_1_SeqPrompt/prompt_strict.txt new file mode 100644 index 0000000..3b632f6 --- /dev/null +++ b/extraction/passes/_1_SeqPrompt/prompt_strict.txt @@ -0,0 +1,39 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions that you have to answer based only on the provided article text. + +STRICT RULES for how you work and respond: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON value that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following task: +* Extract all the DNA or RNA sequences provided in this article and provide them in a JSON format. +* Be sure to extract real nucleotidic sequences in IUPAC form (only the part without modification, as the format does not allow dashes). +* But be careful and cautious and remove letter sequences unrelated to the nucleotide sequences. Articles may have other letter sequences such as abbreviations or so. You are only interested in nucleotidic sequences, such as the probe sequences, primers, target sequences etc. + +Here is the JSON schema you have to follow: +```json +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "AllSequences", + "description": "All DNA, RNA and other sequences present in article", + "type": "array", + "minItems": 1, + "maxItems": 1000, + "items": { + "type": "string", + "minLength": 5, + "maxLength": 150, + "pattern": "^[A-Z0-9()'-]*[ACGUTRYSWKMBDHVN]{5,}[A-Z0-9()'-]*$", + "description": "A single sequence out of all the DNA, RNA and other sequences from the article." 
+ } +} +``` \ No newline at end of file diff --git a/extraction/passes/_1_SeqPrompt/schema.json b/extraction/passes/_1_SeqPrompt/schema.json new file mode 100644 index 0000000..f0ffc9b --- /dev/null +++ b/extraction/passes/_1_SeqPrompt/schema.json @@ -0,0 +1,14 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "AllSequences", + "description": "All DNA, RNA and other sequences present in article", + "type": "array", + "minItems": 1, + "maxItems": 1000, + "items": { + "type": "string", + "minLength": 5, + "maxLength": 150, + "description": "A single sequence out of all the DNA, RNA and other sequences from the article." + } +} \ No newline at end of file diff --git a/extraction/passes/_1_SeqPrompt/schema_strict.json b/extraction/passes/_1_SeqPrompt/schema_strict.json new file mode 100644 index 0000000..e393a55 --- /dev/null +++ b/extraction/passes/_1_SeqPrompt/schema_strict.json @@ -0,0 +1,15 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "AllSequences", + "description": "All DNA, RNA and other sequences present in article", + "type": "array", + "minItems": 1, + "maxItems": 1000, + "items": { + "type": "string", + "minLength": 5, + "maxLength": 150, + "pattern": "^[A-Z0-9()'-]*[ACGUTRYSWKMBDHVN]{5,}[A-Z0-9()'-]*$", + "description": "A single sequence out of all the DNA, RNA and other sequences from the article." + } +} \ No newline at end of file diff --git a/extraction/passes/_2_Experiments/prompt.txt b/extraction/passes/_2_Experiments/prompt.txt new file mode 100644 index 0000000..9c26d9f --- /dev/null +++ b/extraction/passes/_2_Experiments/prompt.txt @@ -0,0 +1,62 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions that you have to answer based only on the provided article text. + +STRICT RULES for how you work and respond: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON value that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +A "hybridization experiment" in terms of this task is an instance of creating or testing a hybridization probe for some target sequence given some set of laboratory parameters. Even if the article mentions "experiments" as the domain-level entity, this task strictly requires you to treat each pair of the target sequence and probe sequence together with its set of parameters as the unique "hybridization experiment". + +Perform the following task: +* Create a list of all hybridization experiments found in the article text and provide it in the form of a JSON array, where each element is an object with the probe, target and parameters keys. 
+ +Here is the JSON schema you have to follow: +```json +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "AllHybridizationExperiments", + "description": "All hybridization experiments described in article", + "type": "array", + "minItems": 0, + "items": { + "description": "A single instance of hybridization experiment from the article.", + "type": "object", + "additionalProperties": false, + "properties": { + "target": { + "type": "string", + "minLength": 5, + "maxLength": 100, + "description": "What is the target in this instance of hybridization experiment? If possible, please provide the sequence. If sequence is not available, describe the target in a free form." + }, + "probe": { + "type": "string", + "minLength": 5, + "maxLength": 100, + "description": "The hybridization probe in this instance of hybridization experiment." + }, + "parameters": { + "type": "string", + "minLength": 20, + "maxLength": 200, + "description": "Briefly describe the laboratory parameters used for setting up for this hybridization experiment." + }, + "hybridization_experiment_description": { + "type": "string", + "minLength": 20, + "maxLength": 200, + "description": "Explain, what was tested in this instance of hybridization experiment." + } + } + } +} +``` \ No newline at end of file diff --git a/extraction/passes/_2_Experiments/prompt_strict.txt b/extraction/passes/_2_Experiments/prompt_strict.txt new file mode 100644 index 0000000..65d39cd --- /dev/null +++ b/extraction/passes/_2_Experiments/prompt_strict.txt @@ -0,0 +1,422 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions that you have to answer based only on the provided article text. + +STRICT RULES for how you work and respond: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON value that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +A "hybridization experiment" in terms of this task is an instance of creating or testing a hybridization probe for some target sequence given some set of laboratory parameters. Even if the article mentions "experiments" as the domain-level entity, this task strictly requires you to treat each pair of the target sequence and probe sequence together with its set of parameters as the unique "hybridization experiment". + +Perform the following task: +* Create a list of all hybridization experiments found in the article text and provide it in the form of a JSON array, where each element is an object with the probe, target and parameters keys. 
+ +Here is the JSON schema you have to follow: +```json +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "AllHybridizationExperiments", + "description": "All hybridization experiments described in article", + "type": "array", + "minItems": 0, + "items": { + "description": "A single instance of hybridization experiment from the article.", + "type": "object", + "additionalProperties": false, + "properties": { + "target": { + "type": "string", + "minLength": 5, + "maxLength": 150, + "description": "What is the target in this instance of hybridization experiment? If possible, please provide the sequence. If sequence is not available, describe the target in a free form.", + "pattern": "^([ACGUTRYSWKMBDHVN]+)$|^(Exact sequence unknown, target description: (.*))$" + }, + "probe": { + "type": "string", + "minLength": 5, + "maxLength": 100, + "description": "The hybridization probe in this instance of hybridization experiment.", + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "parameters": { + "type": "object", + "required": ["probe_type", "chemistry", "labeling", "targeting"], + "properties": { + "probe_type": { + "title": "Probe Type", + "description": "Functional design/mechanism class of the probe. Use when the probe's reporting/capture behavior is known.", + "oneOf": [ + { + "const": "linear", + "title": "Linear", + "description": "Simple oligo that hybridizes without structural activation; often end-labeled." + }, + { + "const": "molecular_beacon", + "title": "Molecular beacon", + "description": "Stem-loop with fluor/quencher; fluorescence turns on when the stem opens on target." + }, + { + "const": "hydrolysis_taqman", + "title": "Hydrolysis (TaqMan)", + "description": "Dual-labeled probe cleaved by 5'->3' exonuclease during PCR; signal increases on cleavage." + }, + { + "const": "fret_dual_hybridization", + "title": "FRET dual-hybridization", + "description": "Two adjacent probes with donor/acceptor dyes; FRET only when both bind nearby." 
+ }, + { + "const": "scorpion", + "title": "Scorpion", + "description": "Primer-linked hairpin probe enabling fast intra-molecular reporting in qPCR." + }, + { + "const": "hcr", + "title": "Hybridization Chain Reaction (HCR)", + "description": "Initiator + hairpins polymerize upon target binding; enzyme-free signal amplification." + }, + { + "const": "branched_dna", + "title": "Branched DNA (bDNA)", + "description": "Signal amplification via multibranch DNA scaffolds without target amplification." + }, + { + "const": "padlock", + "title": "Padlock", + "description": "Linear probe circularizes on perfect match; detected via rolling-circle amplification." + }, + { + "const": "capture", + "title": "Capture", + "description": "Affinity-tagged probe used to enrich/pull down targets (e.g., biotinylated capture baits)." + }, + { + "const": "tiling_set", + "title": "Tiling set", + "description": "Multiple overlapping probes across a region/gene for robust detection." + }, + { + "const": "antisense", + "title": "Antisense", + "description": "Probe designed to bind and block or track transcripts (e.g., ISH tracking)." + } + ] + }, + "chemistry": { + "title": "Chemistry", + "description": "Backbone and chemical modifications (sequence-level chemistry rather than labels/reporters).", + "type": "object", + "additionalProperties": false, + "required": ["backbone"], + "properties": { + "backbone": { + "title": "Backbone", + "description": "Primary nucleic-acid scaffold used by the probe.", + "oneOf": [ + { + "const": "dna", + "title": "DNA", + "description": "Unmodified DNA backbone." + }, + { + "const": "rna", + "title": "RNA", + "description": "Unmodified RNA backbone." + }, + { + "const": "cdna", + "title": "cDNA", + "description": "Complementary DNA derived from RNA." + }, + { + "const": "pna", + "title": "PNA", + "description": "Peptide nucleic acid backbone." 
+ }, + { + "const": "morpholino", + "title": "Morpholino", + "description": "Morpholine-ring phosphorodiamidate backbone." + }, + { + "const": "lna_modified", + "title": "LNA-modified", + "description": "DNA/RNA with Locked Nucleic Acid residues incorporated." + }, + { + "const": "two_ome_rna", + "title": "2'-O-Me RNA", + "description": "2'-O-methyl RNA backbone." + } + ] + }, + "modifications": { + "title": "Chemical Modifications", + "description": "Sequence-level chemistry (not labels): e.g., nuclease protection or affinity spacers.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "phosphorothioate", + "two_ome_spiked", + "lna_spiked", + "mgb", + "inverted_dT_3prime", + "amine_5prime", + "thiol_5prime", + "biotin_teg", + "spacer_18", + "cholesterol" + ], + "description": "Common modification keyword." + } + } + } + }, + "labeling": { + "title": "Labeling", + "description": "Reporting/enrichment labels attached to the probe (fluors, quenchers, haptens, enzymes, isotopes).", + "type": "object", + "additionalProperties": false, + "properties": { + "strategy": { + "title": "Label Strategy", + "description": "High-level labeling approach; combine with concrete labels below as known.", + "type": "string", + "enum": [ + "none", + "fluor_only", + "fluor_quencher", + "hapten", + "enzymatic", + "radioisotope" + ] + }, + "reporters": { + "title": "Reporter Dyes", + "description": "Fluorophores or other reporters (free text to allow any brand/dye).", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "description": "Reporter name (e.g., FAM, HEX, Cy3, ATTO647N)." + } + }, + "quenchers": { + "title": "Quenchers", + "description": "Quenchers used in hydrolysis or hairpin probes.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "description": "Quencher name (e.g., BHQ1, BHQ2, Iowa Black FQ)." 
+ } + }, + "haptens": { + "title": "Haptens", + "description": "Affinity tags detected by antibodies/streptavidin.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "biotin", + "digoxigenin", + "dinitrophenol", + "fluorescein_hapten" + ], + "description": "Common hapten tag." + } + }, + "enzymes": { + "title": "Enzyme Labels", + "description": "Enzyme conjugates used for colorimetric/chemiluminescent detection.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "HRP", + "AP" + ], + "description": "Common conjugated enzyme." + } + }, + "isotopes": { + "title": "Radioisotopes", + "description": "If radio-labeled, indicate isotope(s).", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "description": "Isotope (e.g., 32P, 33P, 35S)." + } + } + } + }, + "targeting": { + "title": "Targeting", + "description": "What the probe is intended to hybridize to, and in what context.", + "type": "object", + "additionalProperties": false, + "properties": { + "biomolecule": { + "title": "Biomolecule", + "description": "High-level target class.", + "type": "string", + "enum": [ + "dna", + "rna", + "mrna", + "mirna", + "lncrna", + "rrna", + "genomic_dna", + "viral_rna", + "amplicon" + ] + }, + "context": { + "title": "Context", + "description": "Assay/biological context for the target.", + "type": "string", + "enum": [ + "genomic", + "transcript", + "amplicon", + "in_situ", + "capture" + ] + }, + "target_name": { + "title": "Target Name", + "description": "Gene/transcript/locus identifier (free text).", + "type": "string" + } + } + }, + "set_design": { + "title": "Set / Panel Design", + "description": "Whether the probe is a single oligo or part of a designed set/panel.", + "type": "object", + "additionalProperties": false, + "properties": { + "mode": { + "title": "Set Mode", + "description": "Single probe or specific multi-probe design.", + "type": "string", + "enum": [ + 
"single", + "tiling_set", + "capture_baits", + "smfish_panel", + "merfish_panel", + "padlock_set" + ] + }, + "count": { + "title": "Probe Count", + "description": "Number of probes in the set/panel (if known).", + "type": "integer", + "minimum": 1 + } + } + }, + "amplification_mechanism": { + "title": "Amplification Mechanism", + "description": "Signal amplification paradigm, if applicable (complements—does not replace—probe_type).", + "type": "string", + "enum": [ + "none", + "hydrolysis", + "fret", + "hairpin_turn_on", + "rolling_circle", + "branched_dna", + "hcr" + ] + }, + "application": { + "title": "Application", + "description": "Intended use(s) of the probe. Provide multiple if applicable.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "qpcr", + "ddpcr", + "pcr_probe", + "fish", + "ish", + "smfish", + "merfish", + "ngs_capture", + "microarray", + "southern", + "northern", + "dot_blot", + "in_cell_imaging" + ], + "description": "Common application keyword." + } + }, + "provenance": { + "title": "Provenance", + "description": "Source metadata for traceability.", + "type": "object", + "additionalProperties": false, + "properties": { + "doi": { + "title": "DOI", + "description": "Digital Object Identifier of the source article.", + "type": "string", + "format": "iri" + }, + "pmid": { + "title": "PMID", + "description": "PubMed identifier.", + "type": "string" + }, + "vendor": { + "title": "Vendor", + "description": "Commercial supplier (if from a catalog).", + "type": "string" + }, + "catalog_number": { + "title": "Catalog Number", + "description": "Supplier's catalog identifier.", + "type": "string" + } + } + }, + "notes": { + "title": "Notes", + "description": "Free-text comments or qualifiers that don't fit other fields.", + "type": "string", + "examples": [ + "Probe includes internal ZEN quencher." 
+ ] + } + } + }, + "hybridization_experiment_description": { + "type": "string", + "minLength": 20, + "maxLength": 200, + "description": "Explain, what was tested in this instance of hybridization experiment." + } + } + } +} +``` \ No newline at end of file diff --git a/extraction/passes/_2_Experiments/schema.json b/extraction/passes/_2_Experiments/schema.json new file mode 100644 index 0000000..ca9b8ed --- /dev/null +++ b/extraction/passes/_2_Experiments/schema.json @@ -0,0 +1,38 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "AllHybridizationExperiments", + "description": "All hybridization experiments described in article", + "type": "array", + "minItems": 0, + "items": { + "description": "A single instance of hybridization experiment from the article.", + "type": "object", + "additionalProperties": false, + "properties": { + "target": { + "type": "string", + "minLength": 5, + "maxLength": 100, + "description": "What is the target in this instance of hybridization experiment? If possible, please provide the sequence. If sequence is not available, describe the target in a free form." + }, + "probe": { + "type": "string", + "minLength": 5, + "maxLength": 100, + "description": "The hybridization probe in this instance of hybridization experiment." + }, + "parameters": { + "type": "string", + "minLength": 20, + "maxLength": 200, + "description": "Briefly describe the laboratory parameters used for setting up for this hybridization experiment." + }, + "hybridization_experiment_description": { + "type": "string", + "minLength": 20, + "maxLength": 200, + "description": "Explain, what was tested in this instance of hybridization experiment." 
+ } + } + } +} diff --git a/extraction/passes/_2_Experiments/schema_strict.json b/extraction/passes/_2_Experiments/schema_strict.json new file mode 100644 index 0000000..7650cba --- /dev/null +++ b/extraction/passes/_2_Experiments/schema_strict.json @@ -0,0 +1,398 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "AllHybridizationExperiments", + "description": "All hybridization experiments described in article", + "type": "array", + "minItems": 0, + "items": { + "description": "A single instance of hybridization experiment from the article.", + "type": "object", + "additionalProperties": false, + "properties": { + "target": { + "type": "string", + "minLength": 5, + "maxLength": 150, + "description": "What is the target in this instance of hybridization experiment? If possible, please provide the sequence. If sequence is not available, describe the target in a free form.", + "pattern": "^([ACGUTRYSWKMBDHVN]+)$|^(Exact sequence unknown, target description: (.*))$" + }, + "probe": { + "type": "string", + "minLength": 5, + "maxLength": 100, + "description": "The hybridization probe in this instance of hybridization experiment.", + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "parameters": { + "type": "object", + "required": ["probe_type", "chemistry", "labeling", "targeting"], + "properties": { + "probe_type": { + "title": "Probe Type", + "description": "Functional design/mechanism class of the probe. Use when the probe's reporting/capture behavior is known.", + "oneOf": [ + { + "const": "linear", + "title": "Linear", + "description": "Simple oligo that hybridizes without structural activation; often end-labeled." + }, + { + "const": "molecular_beacon", + "title": "Molecular beacon", + "description": "Stem-loop with fluor/quencher; fluorescence turns on when the stem opens on target." 
+ }, + { + "const": "hydrolysis_taqman", + "title": "Hydrolysis (TaqMan)", + "description": "Dual-labeled probe cleaved by 5'->3' exonuclease during PCR; signal increases on cleavage." + }, + { + "const": "fret_dual_hybridization", + "title": "FRET dual-hybridization", + "description": "Two adjacent probes with donor/acceptor dyes; FRET only when both bind nearby." + }, + { + "const": "scorpion", + "title": "Scorpion", + "description": "Primer-linked hairpin probe enabling fast intra-molecular reporting in qPCR." + }, + { + "const": "hcr", + "title": "Hybridization Chain Reaction (HCR)", + "description": "Initiator + hairpins polymerize upon target binding; enzyme-free signal amplification." + }, + { + "const": "branched_dna", + "title": "Branched DNA (bDNA)", + "description": "Signal amplification via multibranch DNA scaffolds without target amplification." + }, + { + "const": "padlock", + "title": "Padlock", + "description": "Linear probe circularizes on perfect match; detected via rolling-circle amplification." + }, + { + "const": "capture", + "title": "Capture", + "description": "Affinity-tagged probe used to enrich/pull down targets (e.g., biotinylated capture baits)." + }, + { + "const": "tiling_set", + "title": "Tiling set", + "description": "Multiple overlapping probes across a region/gene for robust detection." + }, + { + "const": "antisense", + "title": "Antisense", + "description": "Probe designed to bind and block or track transcripts (e.g., ISH tracking)." + } + ] + }, + "chemistry": { + "title": "Chemistry", + "description": "Backbone and chemical modifications (sequence-level chemistry rather than labels/reporters).", + "type": "object", + "additionalProperties": false, + "required": ["backbone"], + "properties": { + "backbone": { + "title": "Backbone", + "description": "Primary nucleic-acid scaffold used by the probe.", + "oneOf": [ + { + "const": "dna", + "title": "DNA", + "description": "Unmodified DNA backbone." 
+ }, + { + "const": "rna", + "title": "RNA", + "description": "Unmodified RNA backbone." + }, + { + "const": "cdna", + "title": "cDNA", + "description": "Complementary DNA derived from RNA." + }, + { + "const": "pna", + "title": "PNA", + "description": "Peptide nucleic acid backbone." + }, + { + "const": "morpholino", + "title": "Morpholino", + "description": "Morpholine-ring phosphorodiamidate backbone." + }, + { + "const": "lna_modified", + "title": "LNA-modified", + "description": "DNA/RNA with Locked Nucleic Acid residues incorporated." + }, + { + "const": "two_ome_rna", + "title": "2'-O-Me RNA", + "description": "2'-O-methyl RNA backbone." + } + ] + }, + "modifications": { + "title": "Chemical Modifications", + "description": "Sequence-level chemistry (not labels): e.g., nuclease protection or affinity spacers.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "phosphorothioate", + "two_ome_spiked", + "lna_spiked", + "mgb", + "inverted_dT_3prime", + "amine_5prime", + "thiol_5prime", + "biotin_teg", + "spacer_18", + "cholesterol" + ], + "description": "Common modification keyword." + } + } + } + }, + "labeling": { + "title": "Labeling", + "description": "Reporting/enrichment labels attached to the probe (fluors, quenchers, haptens, enzymes, isotopes).", + "type": "object", + "additionalProperties": false, + "properties": { + "strategy": { + "title": "Label Strategy", + "description": "High-level labeling approach; combine with concrete labels below as known.", + "type": "string", + "enum": [ + "none", + "fluor_only", + "fluor_quencher", + "hapten", + "enzymatic", + "radioisotope" + ] + }, + "reporters": { + "title": "Reporter Dyes", + "description": "Fluorophores or other reporters (free text to allow any brand/dye).", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "description": "Reporter name (e.g., FAM, HEX, Cy3, ATTO647N)." 
+ } + }, + "quenchers": { + "title": "Quenchers", + "description": "Quenchers used in hydrolysis or hairpin probes.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "description": "Quencher name (e.g., BHQ1, BHQ2, Iowa Black FQ)." + } + }, + "haptens": { + "title": "Haptens", + "description": "Affinity tags detected by antibodies/streptavidin.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "biotin", + "digoxigenin", + "dinitrophenol", + "fluorescein_hapten" + ], + "description": "Common hapten tag." + } + }, + "enzymes": { + "title": "Enzyme Labels", + "description": "Enzyme conjugates used for colorimetric/chemiluminescent detection.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "HRP", + "AP" + ], + "description": "Common conjugated enzyme." + } + }, + "isotopes": { + "title": "Radioisotopes", + "description": "If radio-labeled, indicate isotope(s).", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "description": "Isotope (e.g., 32P, 33P, 35S)." 
+ } + } + } + }, + "targeting": { + "title": "Targeting", + "description": "What the probe is intended to hybridize to, and in what context.", + "type": "object", + "additionalProperties": false, + "properties": { + "biomolecule": { + "title": "Biomolecule", + "description": "High-level target class.", + "type": "string", + "enum": [ + "dna", + "rna", + "mrna", + "mirna", + "lncrna", + "rrna", + "genomic_dna", + "viral_rna", + "amplicon" + ] + }, + "context": { + "title": "Context", + "description": "Assay/biological context for the target.", + "type": "string", + "enum": [ + "genomic", + "transcript", + "amplicon", + "in_situ", + "capture" + ] + }, + "target_name": { + "title": "Target Name", + "description": "Gene/transcript/locus identifier (free text).", + "type": "string" + } + } + }, + "set_design": { + "title": "Set / Panel Design", + "description": "Whether the probe is a single oligo or part of a designed set/panel.", + "type": "object", + "additionalProperties": false, + "properties": { + "mode": { + "title": "Set Mode", + "description": "Single probe or specific multi-probe design.", + "type": "string", + "enum": [ + "single", + "tiling_set", + "capture_baits", + "smfish_panel", + "merfish_panel", + "padlock_set" + ] + }, + "count": { + "title": "Probe Count", + "description": "Number of probes in the set/panel (if known).", + "type": "integer", + "minimum": 1 + } + } + }, + "amplification_mechanism": { + "title": "Amplification Mechanism", + "description": "Signal amplification paradigm, if applicable (complements—does not replace—probe_type).", + "type": "string", + "enum": [ + "none", + "hydrolysis", + "fret", + "hairpin_turn_on", + "rolling_circle", + "branched_dna", + "hcr" + ] + }, + "application": { + "title": "Application", + "description": "Intended use(s) of the probe. 
Provide multiple if applicable.", + "type": "array", + "uniqueItems": true, + "items": { + "type": "string", + "enum": [ + "qpcr", + "ddpcr", + "pcr_probe", + "fish", + "ish", + "smfish", + "merfish", + "ngs_capture", + "microarray", + "southern", + "northern", + "dot_blot", + "in_cell_imaging" + ], + "description": "Common application keyword." + } + }, + "provenance": { + "title": "Provenance", + "description": "Source metadata for traceability.", + "type": "object", + "additionalProperties": false, + "properties": { + "doi": { + "title": "DOI", + "description": "Digital Object Identifier of the source article.", + "type": "string", + "format": "iri" + }, + "pmid": { + "title": "PMID", + "description": "PubMed identifier.", + "type": "string" + }, + "vendor": { + "title": "Vendor", + "description": "Commercial supplier (if from a catalog).", + "type": "string" + }, + "catalog_number": { + "title": "Catalog Number", + "description": "Supplier's catalog identifier.", + "type": "string" + } + } + }, + "notes": { + "title": "Notes", + "description": "Free-text comments or qualifiers that don't fit other fields.", + "type": "string", + "examples": [ + "Probe includes internal ZEN quencher." + ] + } + } + }, + "hybridization_experiment_description": { + "type": "string", + "minLength": 20, + "maxLength": 200, + "description": "Explain, what was tested in this instance of hybridization experiment." + } + } + } +} \ No newline at end of file diff --git a/extraction/passes/_3_ConstructSingleExperiment/prompt.txt b/extraction/passes/_3_ConstructSingleExperiment/prompt.txt new file mode 100644 index 0000000..fa5d677 --- /dev/null +++ b/extraction/passes/_3_ConstructSingleExperiment/prompt.txt @@ -0,0 +1,18 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following tasks for JSON extraction: +* Describe the experiment in which the given nucleotide sequence is present and provide your answer in a JSON format following the schema. diff --git a/extraction/passes/_3_ConstructSingleExperiment/schema.json b/extraction/passes/_3_ConstructSingleExperiment/schema.json new file mode 100644 index 0000000..f13ab0b --- /dev/null +++ b/extraction/passes/_3_ConstructSingleExperiment/schema.json @@ -0,0 +1,796 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://example.org/schemas/hybridization-article.schema.json", + "title": "Hybridization Article", + "description": "Per-article extraction of hybridization experiments as target-probe pairs (plus primers/related sequences). 
Includes decorated oligos (fluorophores/quenchers, 5'/3' marks, sense/antisense), and parameters stored as raw text and normalized SI.", + "type": "object", + "unevaluatedProperties": false, + "$defs": { + "extractionReport": { + "type": "object", + "description": "Structured way to declare missing/uncertain items to avoid hallucination. Use JSON Pointers for field locations.", + "additionalProperties": false, + "required": [ + "missing", + "uncertain", + "notes" + ], + "properties": { + "missing": { + "type": "array", + "description": "JSON Pointers to fields that are truly unavailable in the article.", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 0 + }, + "uncertain": { + "type": "array", + "description": "JSON Pointers to fields that are ambiguous or weakly supported.", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 0 + }, + "notes": { + "type": [ + "string", + "null" + ], + "description": "Free-text clarifications, e.g., OCR issues, mapping choices." + } + } + }, + "iupacBases": { + "type": "string", + "description": "DNA/RNA bases in uppercase IUPAC alphabet: A C G U/T R Y S W K M B D H V N. No separators and no ellipsis inside the sequence.", + "pattern": "^[ACGUTRYSWKMBDHVN]+$", + "minLength": 5, + "maxLength": 5000 + }, + "provenance": { + "type": "object", + "description": "Where a value was obtained in the source document.", + "additionalProperties": false, + "required": [ + "source_type", + "page", + "section", + "quote", + "notes" + ], + "properties": { + "source_type": { + "type": "string", + "enum": [ + "pdf", + "html", + "other", + "unknown" + ], + "description": "Type of source the extractor processed." + }, + "page": { + "type": [ + "integer", + "null" + ], + "minimum": 1, + "description": "Page number in the source (1-based), if applicable." + }, + "section": { + "type": [ + "string", + "null" + ], + "description": "Section header or caption in which the value appears." 
+ }, + "quote": { + "type": [ + "string", + "null" + ], + "description": "Short verbatim snippet that directly supports the value." + }, + "notes": { + "type": [ + "string", + "null" + ], + "description": "Extractor notes (e.g., OCR artifact, inferred mapping)." + } + } + }, + "measurement": { + "type": "object", + "description": "Numeric (or quasi-numeric) item holding raw text, optional parsed value/unit, and normalized SI value/unit.", + "additionalProperties": false, + "required": [ + "raw", + "value", + "unit", + "si_value", + "si_unit", + "assumptions", + "provenance" + ], + "properties": { + "raw": { + "type": "string", + "minLength": 1, + "maxLength": 200, + "description": "Exact text as written in the article (e.g., '58 °C', '2 mM', '10%')." + }, + "value": { + "type": [ + "number", + "null" + ], + "description": "Parsed numeric value if present in raw." + }, + "unit": { + "type": [ + "string", + "null" + ], + "description": "Unit as written in the article (e.g., '°C', 'mM', '%')." + }, + "si_value": { + "type": [ + "number", + "null" + ], + "description": "Value converted to SI. Examples: temperature in K; concentrations in mol/m^3; fractions 0-1 for percent." + }, + "si_unit": { + "type": [ + "string", + "null" + ], + "enum": [ + "K", + "mol/m^3", + "Pa", + "kg/m^3", + "s", + "dimensionless", + null + ], + "description": "SI unit after conversion." + }, + "assumptions": { + "type": [ + "string", + "null" + ], + "description": "Conversion assumptions (e.g., density used, ionic strength conventions)." + }, + "provenance": { + "$ref": "#/$defs/provenance" + } + } + }, + "decoratedOligo": { + "type": "object", + "description": "An oligonucleotide possibly decorated at 5'/3' with labels (fluorophores/quenchers). 
Keeps raw string and parsed parts.", + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "labels", + "sense_antisense", + "provenance" + ], + "properties": { + "raw": { + "type": "string", + "minLength": 5, + "maxLength": 200, + "description": "Exact oligo string as seen. MUST CONTAIN NUCLEOTIDES, NOT ONLY NAMES. DO NOT COPY THIS SEQUENCE FROM THE EXAMPLE! NEVER USE ELLIPSIS OR SKIP ANY DATA IN YOUR RESPONSE!!!", + "pattern": "^([ACGUTRYSWKMBDHVN]+)$|^(Exact sequence unknown, here is its description: (.*))$" + }, + "sequence": { + "$ref": "#/$defs/iupacBases", + "description": "Bare base sequence with IUPAC letters only (no labels/hyphens)." + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1, + "description": "Base length if given or derivable (e.g., '(27 b)')." + }, + "prime_prefix": { + "type": [ + "integer", + "null" + ], + "enum": [ + 3, + 5, + null + ], + "description": "Leading prime marker if present (3 or 5). Accepts OCR artifacts like 50/5O/5' during parsing." + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "description": "Label at the 5' end if indicated (e.g., FAM, ROX)." + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "description": "Label at the 3' end if indicated (e.g., BHQ1, BHQ2, RTQ1)." + }, + "labels": { + "type": "array", + "description": "All labels found in textual order, including 5' and 3' labels.", + "minItems": 0, + "maxItems": 10, + "items": { + "type": "string" + } + }, + "sense_antisense": { + "type": [ + "string", + "null" + ], + "enum": [ + "sense", + "antisense", + null + ], + "description": "If the oligo is explicitly designated as sense (s) or antisense (as) in the article." 
+ }, + "provenance": { + "$ref": "#/$defs/provenance" + } + } + }, + "primerPair": { + "type": "object", + "description": "PCR primer pair associated with an amplicon/experiment.", + "additionalProperties": false, + "required": [ + "forward", + "reverse" + ], + "properties": { + "forward": { + "$ref": "#/$defs/decoratedOligo", + "description": "Forward primer as decorated oligo." + }, + "reverse": { + "$ref": "#/$defs/decoratedOligo", + "description": "Reverse primer as decorated oligo." + } + } + }, + "probe": { + "type": "object", + "description": "A hybridization probe with name, optional amplicon ID, and decorated oligo details.", + "additionalProperties": false, + "required": [ + "name", + "oligo", + "amplicon_id", + "fluorophore", + "quencher", + "sense_antisense", + "notes" + ], + "properties": { + "name": { + "type": "string", + "minLength": 2, + "maxLength": 60, + "description": "Probe name exactly as used (e.g., 'N3-FAM(27)s')." + }, + "amplicon_id": { + "type": [ + "string", + "null" + ], + "description": "Amplicon tag associated with the probe (e.g., 'K2', 'K3', 'N2', 'N3', 'B15')." + }, + "oligo": { + "$ref": "#/$defs/decoratedOligo", + "description": "The probe's decorated oligo (sequence, labels, direction)." + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "description": "Fluorophore name if identifiable; otherwise null." + }, + "quencher": { + "type": [ + "string", + "null" + ], + "description": "Quencher name if identifiable; otherwise null." + }, + "sense_antisense": { + "type": [ + "string", + "null" + ], + "enum": [ + "sense", + "antisense", + null + ], + "description": "Sense/antisense designation inferred from probe name suffix (e.g., 's' or 'as')." + }, + "notes": { + "type": [ + "string", + "null" + ], + "description": "Free-text notes about the probe (ambiguities, special chemistry)." 
+ } + } + } + }, + "properties": { + "doi": { + "type": "string", + "minLength": 4, + "maxLength": 100, + "description": "Digital Object Identifier for the article." + }, + "experiment": { + "description": "Full description of a single hybridization experiment instance related to this sequence", + "type": "object", + "additionalProperties": false, + "required": [ + "id_exp", + "raw_description", + "experiment_type", + "metadata", + "sequences", + "experiment_properties", + "outcome", + "pairing", + "extraction_report" + ], + "properties": { + "id_exp": { + "type": "string", + "minLength": 1, + "maxLength": 120, + "description": "Unique experiment identifier (derive if needed from amplicon + probe name')." + }, + "raw_description": { + "type": [ + "string", + "null" + ], + "minLength": 1, + "maxLength": 1000, + "description": "Verbatim or lightly tidied description of the experiment from the article." + }, + "experiment_type": { + "type": "object", + "description": "Description of this single hybridization experiment design.", + "additionalProperties": false, + "required": [ + "probe_type", + "chemistry" + ], + "properties": { + "probe_type": { + "title": "Probe Type", + "description": "Functional design/mechanism class of the probe. Use when the probe's reporting/capture behavior is known.", + "oneOf": [ + { + "const": "linear", + "title": "Linear", + "description": "Simple oligo that hybridizes without structural activation; often end-labeled." + }, + { + "const": "molecular_beacon", + "title": "Molecular beacon", + "description": "Stem-loop with fluor/quencher; fluorescence turns on when the stem opens on target." + }, + { + "const": "hydrolysis_taqman", + "title": "Hydrolysis (TaqMan)", + "description": "Dual-labeled probe cleaved by 5'->3' exonuclease during PCR; signal increases on cleavage." 
+ }, + { + "const": "fret_dual_hybridization", + "title": "FRET dual-hybridization", + "description": "Two adjacent probes with donor/acceptor dyes; FRET only when both bind nearby." + }, + { + "const": "scorpion", + "title": "Scorpion", + "description": "Primer-linked hairpin probe enabling fast intra-molecular reporting in qPCR." + }, + { + "const": "hcr", + "title": "Hybridization Chain Reaction (HCR)", + "description": "Initiator + hairpins polymerize upon target binding; enzyme-free signal amplification." + }, + { + "const": "branched_dna", + "title": "Branched DNA (bDNA)", + "description": "Signal amplification via multibranch DNA scaffolds without target amplification." + }, + { + "const": "padlock", + "title": "Padlock", + "description": "Linear probe circularizes on perfect match; detected via rolling-circle amplification." + }, + { + "const": "capture", + "title": "Capture", + "description": "Affinity-tagged probe used to enrich/pull down targets (e.g., biotinylated capture baits)." + }, + { + "const": "tiling_set", + "title": "Tiling set", + "description": "Multiple overlapping probes across a region/gene for robust detection." + }, + { + "const": "antisense", + "title": "Antisense", + "description": "Probe designed to bind and block or track transcripts (e.g., ISH tracking)." + } + ] + }, + "chemistry": { + "title": "Chemistry Backbone", + "description": "Backbone and chemical modifications (sequence-level chemistry rather than labels/reporters). Primary nucleic-acid scaffold used by the probe.", + "oneOf": [ + { + "const": "dna", + "title": "DNA", + "description": "Unmodified DNA backbone." + }, + { + "const": "rna", + "title": "RNA", + "description": "Unmodified RNA backbone." + }, + { + "const": "cdna", + "title": "cDNA", + "description": "Complementary DNA derived from RNA." + }, + { + "const": "pna", + "title": "PNA", + "description": "Peptide nucleic acid backbone." 
+ }, + { + "const": "morpholino", + "title": "Morpholino", + "description": "Morpholine-ring phosphorodiamidate backbone." + }, + { + "const": "lna_modified", + "title": "LNA-modified", + "description": "DNA/RNA with Locked Nucleic Acid residues incorporated." + }, + { + "const": "two_ome_rna", + "title": "2'-O-Me RNA", + "description": "2'-O-methyl RNA backbone." + } + ] + } + } + }, + "metadata": { + "type": "object", + "additionalProperties": false, + "description": "High-level descriptors linked to this experiment.", + "required": [ + "organism", + "technology", + "annealing", + "pH", + "rna_impurities" + ], + "properties": { + "organism": { + "type": [ + "string", + "null" + ], + "minLength": 2, + "maxLength": 120, + "description": "Organism (e.g., 'human')." + }, + "technology": { + "type": [ + "string", + "null" + ], + "minLength": 2, + "maxLength": 120, + "description": "Assay/technology label per article usage (e.g., 'real-time PCR', 'DMA')." + }, + "annealing": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "description": "Annealing process details, with optional quantitative and qualitative components.", + "required": [ + "quantitative", + "qualitative" + ], + "properties": { + "quantitative": { + "$ref": "#/$defs/measurement", + "description": "Numeric representation (e.g., time or temperature), kept as raw + SI." + }, + "qualitative": { + "type": [ + "boolean", + "null" + ], + "description": "If the article states a qualitative annealing outcome/criterion." + } + } + }, + "pH": { + "$ref": "#/$defs/measurement", + "description": "pH as raw text with optional parsed numeric; SI stored as dimensionless (same numeric value)." 
+ }, + "rna_impurities": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "description": "RNA impurity information, if discussed.", + "required": [ + "quantitative", + "qualitative" + ], + "properties": { + "quantitative": { + "$ref": "#/$defs/measurement", + "description": "Quantity/percentage of RNA impurities." + }, + "qualitative": { + "type": [ + "boolean", + "null" + ], + "description": "Presence/absence or a qualitative statement regarding RNA impurities." + } + } + } + } + }, + "sequences": { + "type": "object", + "additionalProperties": false, + "description": "All sequences relevant to this target-probe experiment.", + "required": [ + "target_sequence", + "probe", + "primer_sequences", + "related_sequences" + ], + "properties": { + "target_sequence": { + "oneOf": [ + { + "$ref": "#/$defs/decoratedOligo" + }, + { + "type": "string", + "pattern": "^(Exact target sequence is unknown, here is its description: .*)$", + "minLength": 70, + "maxLength": 200 + } + ], + "description": "Target genomic sequence if explicitly given; store as decorated oligo only if labels are present; otherwise just sequence and length." + }, + "probe": { + "$ref": "#/$defs/probe", + "description": "The hybridization probe for this experiment." + }, + "primer_sequences": { + "oneOf": [ + { + "$ref": "#/$defs/primerPair" + }, + { + "type": "null" + } + ], + "description": "PCR primers associated with this experiment/amplicon if provided." + }, + "related_sequences": { + "type": "array", + "description": "Additional related sequences (controls, references), if any.", + "minItems": 0, + "maxItems": 50, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "related_sequence", + "description" + ], + "properties": { + "related_sequence": { + "$ref": "#/$defs/decoratedOligo", + "description": "A related sequence (plain or decorated)." 
+ }, + "description": { + "type": [ + "string", + "null" + ], + "minLength": 1, + "maxLength": 200, + "description": "Short explanation of the related sequence's role." + } + } + } + } + } + }, + "experiment_properties": { + "type": "object", + "additionalProperties": false, + "description": "Quantitative and buffer parameters for this experiment.", + "required": [ + "concentrations", + "parameters_SI" + ], + "properties": { + "concentrations": { + "type": "object", + "additionalProperties": false, + "description": "Concentration-related values.", + "required": [ + "dna_rna_concentration", + "concentration_SI" + ], + "properties": { + "dna_rna_concentration": { + "$ref": "#/$defs/measurement", + "description": "Analyte concentration as reported (raw) plus normalized SI (mol/m^3)." + }, + "concentration_SI": { + "$ref": "#/$defs/measurement", + "description": "Optional redundant SI-only concentration if the article already used SI; keep raw text synchronized." + } + } + }, + "parameters_SI": { + "type": "object", + "additionalProperties": false, + "description": "Assay buffer/condition parameters, represented as raw + SI. If any value is not present, fill-in measurements fields as null.", + "required": [ + "temperature", + "Tris", + "Na", + "K", + "Mg", + "DMSO" + ], + "properties": { + "temperature": { + "$ref": "#/$defs/measurement", + "description": "Temperature (e.g., '58 °C'), with SI in Kelvin." + }, + "Tris": { + "$ref": "#/$defs/measurement", + "description": "Tris buffer concentration; SI in mol/m^3 (1 mM = 1 mol/m^3)." + }, + "Na": { + "$ref": "#/$defs/measurement", + "description": "Sodium ion concentration; SI in mol/m^3." + }, + "K": { + "$ref": "#/$defs/measurement", + "description": "Potassium ion concentration; SI in mol/m^3." + }, + "Mg": { + "$ref": "#/$defs/measurement", + "description": "Magnesium ion concentration; SI in mol/m^3." 
+ }, + "DMSO": { + "$ref": "#/$defs/measurement", + "description": "DMSO amount (often % v/v); SI as dimensionless fraction (percent/100)." + } + } + } + } + }, + "outcome": { + "type": "object", + "additionalProperties": false, + "description": "Results for this target-probe pairing.", + "required": [ + "outcome", + "fluorescence", + "comparative_notes" + ], + "properties": { + "outcome": { + "type": [ + "boolean", + "null" + ], + "description": "Boolean result if explicitly stated (e.g., success/failure). If not explicit, leave null." + }, + "fluorescence": { + "$ref": "#/$defs/measurement", + "description": "Fluorescence or signal measurement (raw text + normalized form if numeric). If comparative only, keep statement in 'raw' and numeric fields null." + }, + "comparative_notes": { + "type": [ + "string", + "null" + ], + "minLength": 0, + "maxLength": 500, + "description": "Comparative statements (e.g., 'N3-FAM stronger in real-time PCR; N3-Cy5 stronger in DMA')." + } + } + }, + "pairing": { + "type": "object", + "additionalProperties": false, + "description": "Optional cross-references to paired/reciprocal probes within the same article.", + "required": [ + "paired_with_probe_name", + "relationship" + ], + "properties": { + "paired_with_probe_name": { + "type": [ + "string", + "null" + ], + "description": "Name of the other probe in a reciprocal comparison (e.g., 'N3-Cy5(27)s')." + }, + "relationship": { + "type": [ + "string", + "null" + ], + "description": "Short label describing the relation (e.g., 'reciprocal comparison', 'same sequence different labels')." 
+ } + } + }, + "extraction_report": { + "$ref": "#/$defs/extractionReport" + } + } + }, + "extraction_report": { + "$ref": "#/$defs/extractionReport" + } + }, + "required": [ + "doi", + "experiment", + "extraction_report" + ] +} \ No newline at end of file diff --git a/extraction/passes/_4_ConstructSingleSmallExperiment/prompt.txt b/extraction/passes/_4_ConstructSingleSmallExperiment/prompt.txt new file mode 100644 index 0000000..f4b0e52 --- /dev/null +++ b/extraction/passes/_4_ConstructSingleSmallExperiment/prompt.txt @@ -0,0 +1,18 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following tasks for JSON extraction: +* Describe the hybridization experiment in which the given nucleotide sequence is present and provide your answer in a JSON format following the schema. diff --git a/extraction/passes/_4_ConstructSingleSmallExperiment/schema.json b/extraction/passes/_4_ConstructSingleSmallExperiment/schema.json new file mode 100644 index 0000000..6f41b81 --- /dev/null +++ b/extraction/passes/_4_ConstructSingleSmallExperiment/schema.json @@ -0,0 +1,682 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://example.org/schemas/hybridization-article.schema.json", + "title": "Hybridization Article", + "description": "Per-article extraction of hybridization experiments as target-probe pairs (plus primers/related sequences). 
Includes decorated oligos (fluorophores/quenchers, 5'/3' marks, sense/antisense), and parameters stored as raw text and normalized SI.", + "type": "object", + "unevaluatedProperties": false, + "additionalProperties": false, + "required": [ + "experiment", + "extraction_report" + ], + "$defs": { + "extractionReport": { + "type": "object", + "description": "Structured way to declare missing/uncertain items to avoid hallucination. Use JSON Pointers for field locations.", + "additionalProperties": false, + "required": [ + "missing", + "uncertain", + "notes" + ], + "properties": { + "missing": { + "type": "array", + "description": "JSON Pointers to fields that are truly unavailable in the article.", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 0 + }, + "uncertain": { + "type": "array", + "description": "JSON Pointers to fields that are ambiguous or weakly supported.", + "items": { + "type": "string", + "minLength": 1 + }, + "minItems": 0 + }, + "notes": { + "type": [ + "string", + "null" + ], + "description": "Free-text clarifications, e.g., OCR issues, mapping choices." + } + } + }, + "iupacBases": { + "type": "string", + "description": "DNA/RNA bases in uppercase IUPAC alphabet: A C G U/T R Y S W K M B D H V N. No separators and no ellipsis inside the sequence.", + "pattern": "^([ACGUTRYSWKMBDHVN]+)$|^(Exact sequence unknown, here is its description: (.*))$", + "minLength": 5, + "maxLength": 5000 + }, + "provenance": { + "type": "object", + "description": "Where a value was obtained in the source document.", + "additionalProperties": false, + "properties": { + "source_type": { + "type": "string", + "enum": [ + "pdf", + "html", + "other", + "unknown" + ], + "description": "Type of source the extractor processed." + }, + "page": { + "type": [ + "integer", + "null" + ], + "minimum": 1, + "description": "Page number in the source (1-based), if applicable." 
+ }, + "section": { + "type": [ + "string", + "null" + ], + "description": "Section header or caption in which the value appears." + }, + "quote": { + "type": [ + "string", + "null" + ], + "description": "Short verbatim snippet from the article that directly supports the value." + }, + "notes": { + "type": [ + "string", + "null" + ], + "description": "Extractor notes (e.g., OCR artifact, inferred mapping)." + } + } + }, + "measurement": { + "type": "object", + "description": "Numeric (or quasi-numeric) item holding raw text, optional parsed value/unit, and normalized SI value/unit.", + "additionalProperties": false, + "required": [ + "raw", + "value", + "unit", + "si_value", + "si_unit", + "assumptions" + ], + "properties": { + "raw": { + "type": "string", + "minLength": 1, + "maxLength": 200, + "description": "Exact text as written in the article (e.g., '58 °C', '2 mM', '10%')." + }, + "value": { + "type": [ + "number", + "null" + ], + "description": "Parsed numeric value if present in raw." + }, + "unit": { + "type": [ + "string", + "null" + ], + "description": "Unit as written in the article (e.g., '°C', 'mM', '%')." + }, + "si_value": { + "type": [ + "number", + "null" + ], + "description": "Value converted to SI. Examples: temperature in K; concentrations in mol/m^3; fractions 0-1 for percent." + }, + "si_unit": { + "type": [ + "string", + "null" + ], + "enum": [ + "K", + "mol/m^3", + "Pa", + "kg/m^3", + "s", + "dimensionless", + "%", + "kg", + "mol", + "m", + null + ], + "description": "SI unit after conversion." + }, + "assumptions": { + "type": [ + "string", + "null" + ], + "description": "Conversion assumptions (e.g., density used, ionic strength conventions)." + }, + "provenance": { + "$ref": "#/$defs/provenance" + } + } + }, + "decoratedOligo": { + "type": "object", + "description": "An oligonucleotide possibly decorated at 5'/3' with labels (fluorophores/quenchers). 
Keeps raw string and parsed parts.", + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "labels", + "sense_antisense" + ], + "properties": { + "raw": { + "type": "string", + "minLength": 5, + "maxLength": 200, + "description": "Exact oligo string as seen. MUST CONTAIN NUCLEOTIDES, NOT ONLY NAMES. DO NOT COPY THIS SEQUENCE FROM THE EXAMPLE! NEVER USE ELLIPSIS OR SKIP ANY DATA IN YOUR RESPONSE!!!" + }, + "sequence": { + "$ref": "#/$defs/iupacBases", + "description": "Bare base sequence with IUPAC letters only (no labels/hyphens)." + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1, + "description": "Base length if given or derivable (e.g., '(27 b)')." + }, + "prime_prefix": { + "type": [ + "integer", + "null" + ], + "enum": [ + 3, + 5, + null + ], + "description": "Leading prime marker if present (3 or 5). Accepts OCR artifacts like 50/5O/5' during parsing." + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "description": "Label at the 5' end if indicated (e.g., FAM, ROX)." + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "description": "Label at the 3' end if indicated (e.g., BHQ1, BHQ2, RTQ1)." + }, + "labels": { + "type": "array", + "description": "All labels found in textual order, including 5' and 3' labels.", + "minItems": 0, + "maxItems": 10, + "items": { + "type": "string" + } + }, + "sense_antisense": { + "type": [ + "string", + "null" + ], + "enum": [ + "sense", + "antisense", + null + ], + "description": "If the oligo is explicitly designated as sense (s) or antisense (as) in the article." 
+ }, + "provenance": { + "$ref": "#/$defs/provenance" + } + } + }, + "primerPair": { + "type": "object", + "description": "PCR primer pair associated with an amplicon/experiment.", + "additionalProperties": false, + "required": [ + "forward", + "reverse" + ], + "properties": { + "forward": { + "$ref": "#/$defs/decoratedOligo", + "description": "Forward primer as decorated oligo." + }, + "reverse": { + "$ref": "#/$defs/decoratedOligo", + "description": "Reverse primer as decorated oligo." + } + } + }, + "probe": { + "type": "object", + "description": "A hybridization probe with name, optional amplicon ID, and decorated oligo details.", + "additionalProperties": false, + "required": [ + "name", + "oligo", + "amplicon_id", + "fluorophore", + "quencher", + "sense_antisense", + "notes" + ], + "properties": { + "name": { + "type": "string", + "minLength": 2, + "maxLength": 60, + "description": "Probe name exactly as used (e.g., 'N3-FAM(27)s')." + }, + "amplicon_id": { + "type": [ + "string", + "null" + ], + "description": "Amplicon tag associated with the probe (e.g., 'K2', 'K3', 'N2', 'N3', 'B15')." + }, + "oligo": { + "$ref": "#/$defs/decoratedOligo", + "description": "The probe's decorated oligo (sequence, labels, direction)." + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "description": "Fluorophore name if identifiable; otherwise null." + }, + "quencher": { + "type": [ + "string", + "null" + ], + "description": "Quencher name if identifiable; otherwise null." + }, + "sense_antisense": { + "type": [ + "string", + "null" + ], + "enum": [ + "sense", + "antisense", + null + ], + "description": "Sense/antisense designation inferred from probe name suffix (e.g., 's' or 'as')." + }, + "notes": { + "type": [ + "string", + "null" + ], + "description": "Free-text notes about the probe (ambiguities, special chemistry)." 
+ } + } + } + }, + "properties": { + "experiment": { + "description": "Full description of a single hybridization experiment instance related to this sequence", + "type": "object", + "additionalProperties": false, + "required": [ + "experiment_raw_description", + "sequences", + "experiment_type", + "outcome", + "experiment_properties" + ], + "properties": { + "experiment_raw_description": { + "type": [ + "string", + "null" + ], + "minLength": 1, + "maxLength": 1000, + "description": "Verbatim or lightly tidied description of the experiment from the article." + }, + "sequences": { + "type": "object", + "additionalProperties": false, + "description": "All sequences relevant to this target-probe experiment.", + "required": [ + "target_sequence", + "probe", + "primer_sequences", + "related_sequences" + ], + "properties": { + "target_sequence": { + "oneOf": [ + { + "$ref": "#/$defs/decoratedOligo" + }, + { + "type": "string", + "pattern": "^(Exact target sequence is unknown, here is its description: .*)$", + "minLength": 70, + "maxLength": 200 + } + ], + "description": "Target genomic sequence if explicitly given; store as decorated oligo only if labels are present; otherwise just sequence and length." + }, + "probe": { + "$ref": "#/$defs/probe", + "description": "The hybridization probe for this experiment." + }, + "primer_sequences": { + "oneOf": [ + { + "$ref": "#/$defs/primerPair" + }, + { + "type": "null" + } + ], + "description": "Primers associated with this experiment/amplicon if provided." + }, + "related_sequences": { + "type": "array", + "description": "Additional related sequences (controls, references), if any.", + "minItems": 0, + "maxItems": 50, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "related_sequence", + "description" + ], + "properties": { + "related_sequence": { + "$ref": "#/$defs/decoratedOligo", + "description": "A related sequence (plain or decorated)." 
+ }, + "description": { + "type": [ + "string", + "null" + ], + "minLength": 1, + "maxLength": 200, + "description": "Short explanation of the related sequence's role." + } + } + } + } + } + }, + "experiment_type": { + "type": "object", + "description": "Description of this single hybridization experiment design.", + "additionalProperties": false, + "required": [ + "probe_type", + "chemistry" + ], + "properties": { + "probe_type": { + "title": "Probe Type", + "description": "Functional design/mechanism class of the probe. Use when the probe's reporting/capture behavior is known.", + "oneOf": [ + { + "const": "linear", + "title": "Linear", + "description": "Simple oligo that hybridizes without structural activation; often end-labeled." + }, + { + "const": "molecular_beacon", + "title": "Molecular beacon", + "description": "Stem-loop with fluor/quencher; fluorescence turns on when the stem opens on target." + }, + { + "const": "hydrolysis_taqman", + "title": "Hydrolysis (TaqMan)", + "description": "Dual-labeled probe cleaved by 5'->3' exonuclease during PCR; signal increases on cleavage." + }, + { + "const": "fret_dual_hybridization", + "title": "FRET dual-hybridization", + "description": "Two adjacent probes with donor/acceptor dyes; FRET only when both bind nearby." + }, + { + "const": "scorpion", + "title": "Scorpion", + "description": "Primer-linked hairpin probe enabling fast intra-molecular reporting in qPCR." + }, + { + "const": "hcr", + "title": "Hybridization Chain Reaction (HCR)", + "description": "Initiator + hairpins polymerize upon target binding; enzyme-free signal amplification." + }, + { + "const": "branched_dna", + "title": "Branched DNA (bDNA)", + "description": "Signal amplification via multibranch DNA scaffolds without target amplification." + }, + { + "const": "padlock", + "title": "Padlock", + "description": "Linear probe circularizes on perfect match; detected via rolling-circle amplification." 
+ }, + { + "const": "capture", + "title": "Capture", + "description": "Affinity-tagged probe used to enrich/pull down targets (e.g., biotinylated capture baits)." + }, + { + "const": "tiling_set", + "title": "Tiling set", + "description": "Multiple overlapping probes across a region/gene for robust detection." + }, + { + "const": "antisense", + "title": "Antisense", + "description": "Probe designed to bind and block or track transcripts (e.g., ISH tracking)." + } + ] + }, + "chemistry": { + "title": "Chemistry Backbone", + "description": "Backbone and chemical modifications (sequence-level chemistry rather than labels/reporters). Primary nucleic-acid scaffold used by the probe.", + "oneOf": [ + { + "const": "dna", + "title": "DNA", + "description": "Unmodified DNA backbone." + }, + { + "const": "rna", + "title": "RNA", + "description": "Unmodified RNA backbone." + }, + { + "const": "cdna", + "title": "cDNA", + "description": "Complementary DNA derived from RNA." + }, + { + "const": "pna", + "title": "PNA", + "description": "Peptide nucleic acid backbone." + }, + { + "const": "morpholino", + "title": "Morpholino", + "description": "Morpholine-ring phosphorodiamidate backbone." + }, + { + "const": "lna_modified", + "title": "LNA-modified", + "description": "DNA/RNA with Locked Nucleic Acid residues incorporated." + }, + { + "const": "two_ome_rna", + "title": "2'-O-Me RNA", + "description": "2'-O-methyl RNA backbone." 
+ } + ] + } + } + }, + + + "experiment_properties": { + "type": "object", + "additionalProperties": false, + "description": "Quantitative and buffer parameters for this experiment.", + "required": [ + "concentrations", + "parameters_SI" + ], + "properties": { + "concentrations": { + "type": "object", + "additionalProperties": false, + "description": "Concentration-related values.", + "required": [ + "dna_rna_concentration", + "concentration_SI" + ], + "properties": { + "dna_rna_concentration": { + "$ref": "#/$defs/measurement", + "description": "Analyte concentration as reported (raw) plus normalized SI (mol/m^3)." + }, + "concentration_SI": { + "$ref": "#/$defs/measurement", + "description": "Optional redundant SI-only concentration if the article already used SI; keep raw text synchronized." + } + } + }, + "parameters_SI": { + "type": "object", + "additionalProperties": false, + "description": "Assay buffer/condition parameters, represented as raw + SI. If any value is not present, fill-in measurements fields as null.", + "required": [ + "temperature", + "Tris", + "Na", + "K", + "Mg", + "DMSO" + ], + "properties": { + "temperature": { + "$ref": "#/$defs/measurement", + "description": "Temperature (e.g., '58 °C'), with SI in Kelvin." + }, + "Tris": { + "$ref": "#/$defs/measurement", + "description": "Tris buffer concentration; SI in mol/m^3 (1 mM = 1 mol/m^3)." + }, + "Na": { + "$ref": "#/$defs/measurement", + "description": "Sodium ion concentration; SI in mol/m^3." + }, + "K": { + "$ref": "#/$defs/measurement", + "description": "Potassium ion concentration; SI in mol/m^3." + }, + "Mg": { + "$ref": "#/$defs/measurement", + "description": "Magnesium ion concentration; SI in mol/m^3." + }, + "DMSO": { + "$ref": "#/$defs/measurement", + "description": "DMSO amount (often % v/v); SI as dimensionless fraction (percent/100)." 
+ } + } + } + } + }, + "outcome": { + "type": "object", + "additionalProperties": false, + "description": "Results for this target-probe pairing.", + "required": [ + "outcome", + "hybridization_probability", + "specificity", + "fluorescence", + "comparative_notes" + ], + "properties": { + "outcome": { + "type": [ + "boolean", + "null" + ], + "description": "Boolean result if explicitly stated (e.g., success=true/failure=false). If not explicit, leave null." + }, + "hybridization_probability":{ + "type": "object", + "additionalProperties": false, + "required": ["numeric", "textual"], + "properties":{ + "numeric": { "type": ["number", "null"], "description": "Probability of the probe to hybridize to the target in this hybridization experiment, if stated in the article text."}, + "textual": { "type": ["string", "null"], "maxLength": 200, "description": "Explain the notes from the article regarding the probability of the probe to hybridize to the target in this hybridization experiment, even if the numeric value is not present."} + } + }, + "specificity":{ + "type": "object", + "additionalProperties": false, + "required": ["numeric", "textual"], + "properties":{ + "numeric": { "type": ["number", "null"], "description": "Target specificity of the probe in this hybridization experiment, if stated in the article text."}, + "textual": { "type": ["string", "null"], "maxLength": 200, "description": "Explain the notes from the article regarding the target specificity of the probe in this hybridization experiment, even if the numeric value is not present."} + } + }, + "fluorescence": { + "$ref": "#/$defs/measurement", + "description": "Fluorescence or signal measurement (raw text + normalized form if numeric). If comparative only, keep statement in 'raw' and numeric fields null." 
+ }, + "comparative_notes": { + "type": [ + "string", + "null" + ], + "minLength": 0, + "maxLength": 500, + "description": "Comparative statements (e.g., 'N3-FAM stronger in real-time PCR; N3-Cy5 stronger in DMA')." + } + } + } + } + }, + "extraction_report": { + "$ref": "#/$defs/extractionReport" + } + } +} \ No newline at end of file diff --git a/extraction/passes/_5_ConstructSingleSequenceExperiment/prompt.txt b/extraction/passes/_5_ConstructSingleSequenceExperiment/prompt.txt new file mode 100644 index 0000000..f4b0e52 --- /dev/null +++ b/extraction/passes/_5_ConstructSingleSequenceExperiment/prompt.txt @@ -0,0 +1,18 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following tasks for JSON extraction: +* Describe the hybridization experiment in which the given nucleotide sequence is present and provide your answer in a JSON format following the schema. 
diff --git a/extraction/passes/_5_ConstructSingleSequenceExperiment/schema.json b/extraction/passes/_5_ConstructSingleSequenceExperiment/schema.json new file mode 100644 index 0000000..cc75bad --- /dev/null +++ b/extraction/passes/_5_ConstructSingleSequenceExperiment/schema.json @@ -0,0 +1,678 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "SequencesPerExperiment (generation schema)", + "type": "object", + "additionalProperties": false, + "unevaluatedProperties": false, + "required": [ + "id_exp", + "probe", + "target_sequence", + "primer_sequences", + "related_sequences" + ], + "properties": { + "id_exp": { + "type": "string" + }, + "probe": { + "type": "object", + "additionalProperties": false, + "required": [ + "name", + "amplicon_id", + "fluorophore", + "quencher", + "sense_antisense", + "notes", + "oligo" + ], + "properties": { + "name": { + "type": "string", + "maxLength": 500 + }, + "amplicon_id": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "notes": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "oligo": { + "type": "object", + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": "string", + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + 
"type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + } + } + } + } + }, + "target_sequence": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications", + "fluorophore", + "quencher" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": [ + "string", + "null" + ], + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$", + "description": "Provide IUPAC sequence for the target of this probe, if it's present in article. Otherwise put null here and just put name and description into the raw field." 
+ }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + }, + "primer_sequences": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "required": [ + "forward", + "reverse" + ], + "properties": { + "forward": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications", + "fluorophore", + "quencher" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": [ + "string", + "null" + ], + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + 
"maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + }, + "reverse": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications", + "fluorophore", + "quencher" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": [ + "string", + "null" + ], + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + 
"string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + } + }, + "related_sequences": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "related_sequence", + "description" + ], + "properties": { + "related_sequence": { + "type": "object", + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications", + "fluorophore", + "quencher" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": [ + "string", + "null" + ], + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + 
"string", + "null" + ], + "maxLength": 100 + } + } + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 200 + } + } + } + } + } +} \ No newline at end of file diff --git a/extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/prompt.txt b/extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/prompt.txt new file mode 100644 index 0000000..f4b0e52 --- /dev/null +++ b/extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/prompt.txt @@ -0,0 +1,18 @@ +You are an information-extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions, that you have to answer based only on the provided article text. + +STRICT RULES of how do you work and response: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! + +Perform the following tasks for JSON extraction: +* Describe the hybridization experiment in which the given nucleotide sequence is present and provide your answer in a JSON format following the schema. 
diff --git a/extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/schema.json b/extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/schema.json new file mode 100644 index 0000000..d4b3380 --- /dev/null +++ b/extraction/passes/_6_ConstructSingleSequenceExperimentAndOutcome/schema.json @@ -0,0 +1,849 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "SequencesPerExperiment (generation schema)", + "type": "object", + "additionalProperties": false, + "unevaluatedProperties": false, + "required": [ + "id_exp", + "probe", + "target_sequence", + "primer_sequences", + "related_sequences", + "hybridization_experiment_parameters", + "hybridization_experiment_outcome" + ], + "$defs": { + "measurement_lite": { + "type": "object", + "additionalProperties": false, + "required": [ + "raw", + "value", + "unit" + ], + "properties": { + "raw": { + "type": "string", + "minLength": 1, + "maxLength": 200, + "description": "Textual value representation." + }, + "value": { + "type": [ + "number", + "null" + ], + "description": "Numeric value representation." + }, + "unit": { + "type": [ + "string", + "null" + ], + "maxLength": 50, + "description": "Measurement unit for the numeric value representation." 
+ } + } + } + }, + "properties": { + "id_exp": { + "type": "string" + }, + "probe": { + "type": "object", + "additionalProperties": false, + "required": [ + "name", + "amplicon_id", + "fluorophore", + "quencher", + "sense_antisense", + "notes", + "oligo" + ], + "properties": { + "name": { + "type": "string", + "maxLength": 500 + }, + "amplicon_id": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "notes": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "oligo": { + "type": "object", + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": "string", + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + 
"maxLength": 100 + } + } + } + } + } + } + } + }, + "target_sequence": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications", + "fluorophore", + "quencher" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": [ + "string", + "null" + ], + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$", + "description": "Provide IUPAC sequence for the target of this probe, if it's present in article. Otherwise put null here and just put name and description into the raw field." + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + }, + "primer_sequences": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "required": [ + "forward", + "reverse" + ], + "properties": { + "forward": { + "type": [ + "object", + "null" + ], + 
"additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications", + "fluorophore", + "quencher" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": [ + "string", + "null" + ], + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + }, + "reverse": { + "type": [ + "object", + "null" + ], + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications", + "fluorophore", + "quencher" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": [ + "string", + "null" + ], + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "length_bases": { + 
"type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + } + }, + "related_sequences": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "related_sequence", + "description" + ], + "properties": { + "related_sequence": { + "type": "object", + "additionalProperties": false, + "required": [ + "raw", + "sequence", + "length_bases", + "prime_prefix", + "five_prime_label", + "three_prime_label", + "sense_antisense", + "modifications", + "fluorophore", + "quencher" + ], + "properties": { + "raw": { + "type": "string", + "maxLength": 500 + }, + "sequence": { + "type": [ + "string", + "null" + ], + "minLength": 5, + "maxLength": 500, + "pattern": "^[ACGUTRYSWKMBDHVN]+$" + }, + "length_bases": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "prime_prefix": { + "enum": [ + 3, + 5, + null + ] + }, + "five_prime_label": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "three_prime_label": { + "type": [ + "string", + "null" 
+ ], + "maxLength": 100 + }, + "sense_antisense": { + "enum": [ + "sense", + "antisense", + null + ] + }, + "modifications": { + "type": "array", + "minItems": 0, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "modification_position", + "modification_type", + "description" + ], + "properties": { + "modification_position": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "modification_type": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + } + }, + "fluorophore": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + }, + "quencher": { + "type": [ + "string", + "null" + ], + "maxLength": 100 + } + } + }, + "description": { + "type": [ + "string", + "null" + ], + "maxLength": 200 + } + } + } + }, + "hybridization_experiment_parameters": { + "type": "object", + "additionalProperties": false, + "required": [ + "organism", + "technology", + "annealing", + "pH", + "rna_impurities", + "temperature", + "Tris", + "Na", + "K", + "Mg", + "DMSO" + ], + "properties": { + "organism": { + "type": [ + "string", + "null" + ], + "maxLength": 200 + }, + "technology": { + "type": [ + "string", + "null" + ], + "maxLength": 200 + }, + "temperature": { + "$ref": "#/$defs/measurement_lite" + }, + "Tris": { + "$ref": "#/$defs/measurement_lite" + }, + "Na": { + "$ref": "#/$defs/measurement_lite" + }, + "K": { + "$ref": "#/$defs/measurement_lite" + }, + "Mg": { + "$ref": "#/$defs/measurement_lite" + }, + "DMSO": { + "$ref": "#/$defs/measurement_lite" + }, + "annealing": { + "oneOf": [ + { + "type": "object", + "additionalProperties": false, + "required": [ + "quantitative", + "qualitative" + ], + "properties": { + "quantitative": { + "$ref": "#/$defs/measurement_lite" + }, + "qualitative": { + "type": [ + "boolean", + "null" + ] + } + } + }, + { + "type": "null" + } + ] + }, + "pH": { + "$ref": "#/$defs/measurement_lite" + }, + 
"rna_impurities": { + "oneOf": [ + { + "type": "object", + "additionalProperties": false, + "required": [ + "quantitative", + "qualitative" + ], + "properties": { + "quantitative": { + "$ref": "#/$defs/measurement_lite" + }, + "qualitative": { + "type": [ + "boolean", + "null" + ] + } + } + }, + { + "type": "null" + } + ] + } + } + }, + "hybridization_experiment_outcome": { + "type": "object", + "additionalProperties": false, + "required": [ + "outcome", + "fluorescence", + "comparative_notes" + ], + "properties": { + "outcome": { + "type": [ + "boolean", + "null" + ], + "description": "Put true in case of successful hybridization, false if unsuccessful, null if could not be onferred from the article text." + }, + "fluorescence": { + "$ref": "#/$defs/measurement_lite", + "description": "Amount of fluorescence in this hybridization experiment." + }, + "comparative_notes": { + "type": [ + "string", + "null" + ], + "maxLength": 500 + } + } + } + } +} \ No newline at end of file diff --git a/extraction/passes/common.txt b/extraction/passes/common.txt new file mode 100644 index 0000000..345cb7a --- /dev/null +++ b/extraction/passes/common.txt @@ -0,0 +1,16 @@ +You are the State-of-the-Art information extraction model. + +You would be given a full text of the article between the tags
and
and a series of questions, that you have to answer based only on the provided article text. +A "hybridization experiment" in terms of this task is an instance of creating or testing a hybridization probe for some target sequence given some set of laboratory parameters. Even if article mentions "experiments" as the domain-level entity, this task strictly requires you to treat each pair of the target sequence and probe sequence together with its set of parameters as the unique "hybridization experiment". + +STRICT RULES of how do you work and response: +* Never invent values; use `null` when unknown. +* Keep text exactly as in the article (no ellipses, no expansions). +* Output all data fully, never skip or insert ellipses. +* If unsure whether a value exists, set it to `null` and record a JSON Pointer in `extraction_report.missing` or `extraction_report.uncertain`. +* Use the article's wording for names. +* Do not copy sequences from examples! +* No one will manually check and finish your job. It will be automated and must be correct and suitable for automated analysis. +* Use ONLY English language and Latin script, only ASCII. +* Output only a single JSON object that conforms to the provided JSON Schema. +* For the perfect result compliant to all constraints and limitations I will tip $2000! diff --git a/extraction/pipeline_pre_quest.py b/extraction/pipeline_pre_quest.py new file mode 100755 index 0000000..45589f8 --- /dev/null +++ b/extraction/pipeline_pre_quest.py @@ -0,0 +1,3060 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Aleksandr Serdiukov, Vitalii Dravgelis, Daniil Smutin, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +# pipeline_filedriven.py +# -*- coding: utf-8 -*- +""" +File-driven multi-pass extractor with Outlines + Ollama. + +- Reads config, prompts, and schemas from disk (Git-friendly). +- Runs A..F passes (configurable) with Outlines JSON-guided generation. +- Saves raw text (*.txt), pretty JSON (*.json), and errors (*.log), never overwriting. +- Stitches pass outputs into a full object, validates against full schema (if provided), + and optionally inserts into SQLite via hyb_db.insert_article_object. + +NEW FEATURES ADDED: +1. 
Performance metrics / sidecar files: + - For every JSON artifact we emit, we now also write a ".perf.json" + sidecar with: + - start/end timestamps + - wallclock duration_ms + - token usage (if available from Ollama; otherwise nulls) + - simple throughput + - model_name / article_name / pass_name + - artifact path + These metrics are also mirrored into SQLite (if cfg.db_path is not None) + using hyb_db.insert_pipeline_artifact(...), with automatic table creation. + - This allows downstream automated QC / benchmarking. + +2. Continuation / resume: + - The pipeline can resume across runs without losing progress. + - Unless --fresh is passed, we will SKIP any (model_name, article_name) + pair that is already marked complete for pass_name "FULL" in the DB + (hyb_db.get_completed_passes()). + - This satisfies the requirement that we do not reprocess already-finished + articles on rerun. + - Re-doing the _current_ interrupted article from scratch is acceptable + (the spec explicitly allows reparsing one article). 
+ + +Requirements: + pip install outlines ollama jsonschema tqdm json_repair loguru + +Usage (script): + from pipeline_filedriven import run_project + run_project("your_project_dir", fresh=False) + +CLI: + python pipeline_filedriven.py [--fresh] + +The project_dir must contain (by default): + config/pipeline.json + passes//{schema.json,prompt.txt} + schemas/full.json + inputs/*.txt +""" + +import json +import logging +import re +import os, sys +import time +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Dict, List, Literal, Optional, Set, Tuple + +import ollama +import outlines +from jsonschema import Draft202012Validator +from outlines.types import JsonSchema +from tqdm import tqdm +from json_repair import repair_json as rep_json + +API_TOKEN = os.getenv("OPEN_BUTTON_TOKEN", None) + + +# ────────────────────────────────────────────────────────────────────── +# Config models +# ────────────────────────────────────────────────────────────────────── + + +@dataclass +class PassConfig: + """Single extraction pass config loaded from pipeline.json.""" + + name: str # e.g., "A_core" + schema_path: Path # path to JSON Schema file + prompt_path: Path # path to the prompt .txt file + timeout: Optional[int] + + +@dataclass +class PipelineConfig: + """Pipeline config loaded from config/pipeline.json.""" + + model_names: List[str] + ollama_parameters: Dict[str, Any] + ollama_base_url: str + timeout_s: Optional[int] + input_dir: Path + out_dir: Path + full_schema_path: Optional[Path] + common_prompt_path: Optional[Path] + construct_single_experiment_passes: List[PassConfig] + db_path: Optional[Path] + article_glob: str + pre_passes: List[PassConfig] + passes: List[PassConfig] + + +def model_name_encode(model_name: str) -> str: + return model_name.replace("/", "_").replace("\\", "_").replace(":", "_") + + +def load_pipeline_config(project_dir: Path) -> PipelineConfig: + """Load pipeline.json and 
construct a PipelineConfig. + + Expected JSON structure in config/pipeline.json: + { + "model_name": "myaniu/qwen2.5-1m:7b", + "num_ctx": 131072, + "num_predict": 65536, + "timeout_s": 1800, + "input_dir": "inputs", + "out_dir": "out", + "full_schema_path": "schemas/full.json", + "db_path": "out/massive.sqlite", // or null to skip DB + "article_glob": "*.txt", + "passes": [ + {"name": "A_core", "schema": "passes/A_core/schema.json", "prompt": "passes/A_core/prompt.txt"}, + {"name": "B_index", "schema": "passes/B_index/schema.json", "prompt": "passes/B_index/prompt.txt"}, + {"name": "C_sequences", "schema": "passes/C_sequences/schema.json", "prompt": "passes/C_sequences/prompt.txt"}, + {"name": "D_parameters","schema": "passes/D_parameters/schema.json","prompt": "passes/D_parameters/prompt.txt"}, + {"name": "E_outcomes", "schema": "passes/E_outcomes/schema.json", "prompt": "passes/E_outcomes/prompt.txt"}, + {"name": "F_pairings", "schema": "passes/F_pairings/schema.json", "prompt": "passes/F_pairings/prompt.txt"} + ] + } + """ + cfg_path = project_dir / "config" / "pipeline.json" + data = json.loads(cfg_path.read_text(encoding="utf-8")) + + def _opt_path(p) -> Optional[Path]: + return (project_dir / p) if p else None + + pre_passes: List[PassConfig] = [] + for p in data["pre_passes"]: + pre_passes.append( + PassConfig( + name=p["name"], + schema_path=project_dir / p["schema"], + prompt_path=project_dir / p["prompt"], + timeout=p.get("timeout", None), + ) + ) + + construct_single_experiment_passes = [] + for p in data["construct_single_experiment_passes"]: + construct_single_experiment_passes.append( + PassConfig( + name=p["name"], + schema_path=project_dir / p["schema"], + prompt_path=project_dir / p["prompt"], + timeout=p.get("timeout", None), + ) + ) + + passes: List[PassConfig] = [] + for p in data["passes"]: + passes.append( + PassConfig( + name=p["name"], + schema_path=project_dir / p["schema"], + prompt_path=project_dir / p["prompt"], + 
timeout=p.get("timeout", None), + ) + ) + + return PipelineConfig( + model_names=list(data.get("model_names", [])), + ollama_parameters=dict(data.get("ollama_parameters", {})), + ollama_base_url=str(data.get("ollama_base_url", None)), + timeout_s=int(data.get("timeout_s", None)), + input_dir=project_dir / data.get("input_dir", "inputs"), + out_dir=project_dir / data.get("out_dir", "out"), + full_schema_path=_opt_path(data.get("full_schema_path")), + common_prompt_path=_opt_path(data.get("common_prompt_path")), + construct_single_experiment_passes=construct_single_experiment_passes, + db_path=_opt_path(data.get("db_path")), + article_glob=data.get("article_glob", "*.txt"), + pre_passes=pre_passes, + passes=passes, + ) + + +# ────────────────────────────────────────────────────────────────────── +# Logging +# ────────────────────────────────────────────────────────────────────── + + +class TqdmLoggingHandler(logging.Handler): + def emit(self, record): + try: + msg = self.format(record) + tqdm.write(msg) + self.flush() + except (KeyboardInterrupt, SystemExit): + raise + except: + self.handleError(record) + + +def _make_logger(log_dir: Path) -> logging.Logger: + log_dir.mkdir(parents=True, exist_ok=True) + logger = logging.getLogger("pipeline_filedriven") + logger.setLevel(logging.INFO) + logger.handlers.clear() + + # ch = logging.StreamHandler(sys.stdout) + ch = TqdmLoggingHandler() + ch.setLevel(logging.INFO) + ch.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")) + logger.addHandler(ch) + + fh = logging.FileHandler(log_dir / "pipeline.log", encoding="utf-8") + fh.setLevel(logging.INFO) + fh.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")) + logger.addHandler(fh) + return logger + + +# ────────────────────────────────────────────────────────────────────── +# NEW: Perf / sidecar helpers (feature 1) and continuation helpers (feature 2) +# ────────────────────────────────────────────────────────────────────── + + 
+def _write_perf_sidecar_and_db( + *, + artifact_path: Path, + pass_name: str, + model_name: str, + article_name: str, + start_time: datetime, + end_time: datetime, + prompt_tokens: Optional[int], + completion_tokens: Optional[int], + db_path: Optional[Path], + logger: logging.Logger, + notes: Optional[str] = None, +) -> None: + """Write .perf.json sidecar and mirror metrics into SQLite. + + - duration_ms: wallclock elapsed between start_time and end_time + - prompt_tokens / completion_tokens: from Ollama metadata when available + - total_tokens / tokens_per_sec: derived + - if db_path is provided, also insert a row into hyb_db.pipeline_artifacts + (auto-creates tables if needed) + + This function never raises; it logs exceptions instead. + """ + try: + duration_ms = (end_time - start_time).total_seconds() * 1000.0 + except Exception: + duration_ms = None + + total_tokens = None + if (prompt_tokens is not None) or (completion_tokens is not None): + total_tokens = (prompt_tokens or 0) + (completion_tokens or 0) + + tokens_per_sec = None + try: + if total_tokens is not None and duration_ms and duration_ms > 0: + tokens_per_sec = total_tokens / (duration_ms / 1000.0) + except Exception: + tokens_per_sec = None + + sidecar_dict = { + "model_name": model_name, + "article_name": article_name, + "pass_name": pass_name, + "artifact_path": str(artifact_path), + "started_at": start_time.isoformat(), + "finished_at": end_time.isoformat(), + "duration_ms": duration_ms, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + "tokens_per_sec": tokens_per_sec, + "notes": notes, + } + + sidecar_path = Path(str(artifact_path) + ".perf.json") + try: + sidecar_path.write_text( + json.dumps(sidecar_dict, indent=2, ensure_ascii=False), encoding="utf-8" + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to write sidecar for {artifact_path}: {repr(e)}" + ) + + if db_path: + try: + from hyb_db import 
insert_pipeline_artifact + + insert_pipeline_artifact( + db_path=str(db_path), + artifact={ + "model_name": model_name, + "article_name": article_name, + "pass_name": pass_name, + "artifact_path": str(artifact_path), + "sidecar_path": str(sidecar_path), + "started_at": start_time.isoformat(), + "finished_at": end_time.isoformat(), + "duration_ms": duration_ms, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + "tokens_per_sec": tokens_per_sec, + "success": True, + "notes": notes, + }, + ) + except Exception as e: + logger.exception( + f"[PERF][DB] insert_pipeline_artifact failed for {artifact_path}: {repr(e)}" + ) + + +# ────────────────────────────────────────────────────────────────────── +# Tools (Ollama helpers) — Google-style docstrings +# ────────────────────────────────────────────────────────────────────── + + +def to_si( + value: Optional[float], unit: Optional[str] +) -> Tuple[Optional[float], Optional[str]]: + """Convert a numeric value and unit to SI. + + Supports temperature and common concentrations. + + Args: + value: Parsed numeric value or None. + unit: Unit string as written (e.g., '°C', 'mM', 'µM', 'nM', '%', 'K'). + + Returns: + A pair (si_value, si_unit), or (None, None) if unknown. 
+ """ + if value is None or unit is None: + return None, None + u = unit.strip().lower().replace("µ", "u") + if u in {"c", "°c", "deg c", "celsius"}: + return value + 273.15, "K" + if u in {"k", "kelvin"}: + return value, "K" + if u in {"m", "mol/l"}: + return value * 1000.0, "mol/m^3" + if u in {"mm", "mmol/l", "mmol", "mm"}: + return value * 1.0, "mol/m^3" + if u in {"um", "umol/l", "µm", "µmol/l", "micromolar"}: + return value * 1e-3, "mol/m^3" + if u in {"nm", "nmol/l", "nanomolar"}: + return value * 1e-6, "mol/m^3" + if u in {"%", "percent", "perc"}: + return value / 100.0, "dimensionless" + return None, None + + +OLIGO_RE = re.compile( + r"^\s*(?:(?P(?:5|3)(?:['′’]|0|O)?)\s*-\s*)?(?:(?P(?:[A-Za-z0-9+]+-)+))?" + r"(?P[ACGUTRYSWKMBDHVN]+)(?:(?P(?:-[A-Za-z0-9+]+)+))?" + r"(?:\s*\(\s*(?P\d+)\s*(?:b|bp)\s*\)\s*)?\s*$", + re.X, +) + + +def parse_oligo(raw: Optional[str]) -> Dict[str, Any]: + """Parse a decorated oligo string into structured parts. + + Args: + raw: The exact oligo string from the article (may include labels and length). + + Returns: + A dict with keys: raw, sequence, length_bases, prime_prefix, + five_prime_label, three_prime_label, labels, sense_antisense (None). 
+ """ + result = { + "raw": raw, + "sequence": None, + "length_bases": None, + "prime_prefix": None, + "five_prime_label": None, + "three_prime_label": None, + "labels": [], + "sense_antisense": None, + } + if not raw: + return result + m = OLIGO_RE.match(raw) + if not m: + return result + prime = m.group("prime") + if prime: + result["prime_prefix"] = 5 if prime.startswith("5") else 3 + seq = m.group("seq") + if seq: + result["sequence"] = seq.upper() + if m.group("len"): + result["length_bases"] = int(m.group("len")) + labels: List[str] = [] + if m.group("prefix"): + labels += [x for x in m.group("prefix").split("-") if x] + if m.group("suffix"): + labels += [x for x in m.group("suffix").split("-") if x] + result["labels"] = labels + if labels: + result["five_prime_label"] = labels[0] + result["three_prime_label"] = labels[-1] + return result + + +def make_measurement( + raw: Optional[str], value: Optional[float] = None, unit: Optional[str] = None +) -> Dict[str, Any]: + """Build a 'measurement' object with SI conversion. + + Args: + raw: Raw textual measurement (e.g., '58 °C', '2 mM', '10%'). + value: Parsed numeric value, if available. + unit: Unit string as written. + + Returns: + A dict with keys: raw, value, unit, si_value, si_unit, assumptions (None). 
+ """ + si_value, si_unit = ( + to_si(value, unit) if (value is not None and unit is not None) else (None, None) + ) + return { + "raw": raw or "", + "value": value, + "unit": unit, + "si_value": si_value, + "si_unit": si_unit, + "assumptions": None, + } + + +# ────────────────────────────────────────────────────────────────────── +# JSON helpers +# ────────────────────────────────────────────────────────────────────── + + +def repair_json(text: str) -> str: + """Best-effort JSON repair for streamed outputs.""" + start = text.find("{") + end = text.rfind("}") + if start == -1 or end == -1 or end <= start: + return text + candidate = rep_json(text[start : end + 1]) + try: + json.loads(candidate) + return candidate + except Exception: + candidate = re.sub(r",\s*([}\]])", r"\1", candidate) + json.loads(candidate) + return candidate + + +# ────────────────────────────────────────────────────────────────────── +# Chat helpers +# ────────────────────────────────────────────────────────────────────── + +# ────────────────────────────────────────────────────────────────────── +# Fast stateful chat for structured JSON answers (Ollama context reuse) +# ────────────────────────────────────────────────────────────────────── +from typing import Callable + + +class OllamaJSONChat: + """ + Keeps a persistent Ollama 'context' using the generate() API. + We seed once with a system prompt (includes article snippet & sequence), + then for each question we call generate() with only the new instruction + and pass the returned `context` back in. + + NEW: + - self._last_meta stores token / timing metadata from the most recent + .ask_json() call so we can accumulate perf stats. 
+ """ + + def __init__( + self, + client: ollama.Client, + model_name: str, + system_prompt: str, + *, + options: Optional[Dict[str, Any]] = None, + keep_alive: str = "2m", + logger: Optional[logging.Logger] = None, + use_schema_format: bool = True, + ) -> None: + self.client = client + self.model_name = model_name + self.options = options or {} + self.keep_alive = keep_alive + self.logger = logger or logging.getLogger("OllamaJSONChat") + self.context: Optional[List[int]] = None + self._last_meta: Dict[str, Any] = {} + self._schema_supported = False + + # Bootstrap the KV cache with the system prompt once. + boot = self.client.generate( + model=self.model_name, + prompt=system_prompt, + options=self.options, + keep_alive=self.keep_alive, + ) + self.context = boot.get("context") + + # Detect JSON schema support (best effort: try once without touching our context). + if use_schema_format: + try: + _ = self.client.generate( + model=self.model_name, + prompt="Return {}", + options=self.options, + keep_alive=self.keep_alive, + # IMPORTANT: do not pass our current context here, so we don't pollute it + format={"type": "json", "schema": {"type": "object"}}, + ) + self._schema_supported = True + except Exception: + self._schema_supported = False + + def ask_json( + self, + user_prompt: str, + *, + schema: Optional[Dict[str, Any]] = None, + ) -> str: + """ + Ask a single question. Only the new instruction is sent; the previous + state is carried via `context`. + Returns the raw text from `response`. + + Also captures Ollama's prompt_eval_count / eval_count in self._last_meta. 
+ """ + kwargs = dict( + model=self.model_name, + prompt=user_prompt, + options=self.options, + keep_alive=self.keep_alive, + context=self.context, # ← this is supported by generate(), not chat() + ) + if schema is not None and self._schema_supported: + kwargs["format"] = {"type": "json", "schema": schema} + else: + kwargs["format"] = "json" + + res = self.client.generate(**kwargs) + # Persist updated KV context + self.context = res.get("context", self.context) + + # Capture perf metadata from Ollama response. + # Typical keys: prompt_eval_count, eval_count, total_duration, etc. + self._last_meta = { + "prompt_eval_count": res.get("prompt_eval_count"), + "eval_count": res.get("eval_count"), + "total_duration": res.get("total_duration"), + "prompt_eval_duration": res.get("prompt_eval_duration"), + "eval_duration": res.get("eval_duration"), + } + + return res.get("response", "") # generate() returns 'response' + + +def extract_relevant_snippet(article_text: str, seq: str, *, window: int = 1200) -> str: + """ + Find a case-insensitive hit of 'seq' in article_text and return a small window + around it. If not found, return the first ~window*2 characters as a fallback. + This dramatically reduces re-tokenization cost per turn. 
+ """ + if not article_text: + return "" + # normalize simple whitespace + case-insensitive search + text = article_text + seq_norm = re.sub(r"\s+", "", seq, flags=re.S).lower() + text_compact = re.sub(r"\s+", "", text, flags=re.S).lower() + + idx = text_compact.find(seq_norm) if seq_norm else -1 + if idx == -1: + # fallback: just take a chunk from the start + return text[: window * 2] + + # Map back to original indices approximately + # We walk original text accumulating compact length until we cross idx + comp_len = 0 + start_raw = 0 + for i, ch in enumerate(text): + if not ch.isspace(): + comp_len += 1 + if comp_len >= max(0, idx - 5): # a little headroom + start_raw = i + break + # Now center a window around start_raw + lo = max(0, start_raw - window) + hi = min(len(text), start_raw + window) + return text[lo:hi] + + +def run_query_model_speed_up( + model: Any, # kept for signature compatibility; not used here + article_text: str, + sequences: List[str], + out_base: Path, + article_stem: str, + common_prompt_path: Path, + logger: logging.Logger, + ollama_parameters: Dict[str, Any], + model_name: str, + tqdm_position: int = 0, + client: Optional[ollama.Client] = None, # NEW: pass the ollama client here + chat_prompts: Literal["my", "optimized"] = "my", + db_path: Optional[Path] = None, # NEW: for perf sidecar + DB + article_name: Optional[str] = None, # NEW: for perf sidecar + DB +) -> List[Tuple[str, Any]]: + """ + Faster version: use Ollama chat 'context' to avoid re-sending the whole chat every turn, + and seed each sequence with a small snippet instead of the full article. + + NEW: + - We track timing & token counts across the entire pass (all sequences/questions). + - We emit .perf.json sidecars next to generated .json/.log.json. + - We mirror those metrics into SQLite for continuation / benchmarking. + """ + if client is None: + raise ValueError( + "run_query_model requires an ollama.Client via the 'client' argument." 
+ ) + + pass_name = "query_chat" + txt_dir = out_base / "txt" + json_dir = out_base / "json" + log_dir = out_base / "logs" + for d in (txt_dir, json_dir, log_dir): + d.mkdir(parents=True, exist_ok=True) + + prompt = common_prompt_path.read_text(encoding="utf-8") + + stamp = _now_stamp() + raw_txt_path = ( + txt_dir + / f"{article_stem}__{pass_name}__{model_name_encode(model_name)}__{stamp}.txt" + ) + json_log_path = ( + json_dir + / f"{article_stem}__{pass_name}__{model_name_encode(model_name)}__{stamp}.log.json" + ) + json_out_path = ( + json_dir + / f"{article_stem}__{pass_name}__{model_name_encode(model_name)}__{stamp}.json" + ) + err_log_path = ( + log_dir + / f"{article_stem}__{pass_name}__{model_name_encode(model_name)}__{stamp}.log" + ) + + logger.info(f"[{pass_name}:{model_name}] generating (fast chat mode)…") + + # Define your Q&A list once (same as your original) but as Python dicts for direct JSON schema passing. + # NOTE: We’ll construct outlines.JsonSchema only if you still want stricter client-side validation. + questions_to_schema: List[Tuple[str, str, Dict[str, Any]]] + if chat_prompts == "optimized": + questions_to_schema = [ + ( + "is_seq", + "Check the entire snippet. Is the provided sequence (or that exact string) presented as a hybridization probe in this article snippet? Return true only if it's a probe (or its explicit part).", + {"type": "boolean"}, + ), + ( + "sequence_full", + "Return the full probe string in IUPAC-normalized format, including 5'/3' and labels if present (fluorophore first, quencher last). Return null if not applicable.", + {"type": ["string", "null"], "minLength": 5, "maxLength": 150}, + ), + ( + "sequence_normalized", + "Return the same probe with explicit 5' and 3' bounds, e.g., 5'-FAM-ACGT...-BHQ1-3'. 
Return null if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + # "pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + ), + ( + "sequence_expanded", + "Return the expanded IUPAC probe (no parentheses in backbone), with 5'/3' bounds and labels if present. Return null if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + # "pattern": r"^5'-([A-Za-z0-9_'\-]*-)?([A-Za-z0-9']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9']*?)(-[A-Za-z0-9_'\-]*)?-3'$", + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + ), + ( + "sequence_backbone", + "Return backbone only (no labels/mods), 5'…3'. Return null if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([ACGUTRYSWKMBDHVN0-9()]{5,})-3'$", + }, + ), + ( + "sequence_backbone_expanded", + "Return backbone expanded only (no labels/mods), 5'…3'. 
Return null if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([ACGUTRYSWKMBDHVN0-9]{5,})-3'$", + }, + ), + ( + "fluorophore", + "Return fluorophore (uppercase, alnum, apostrophe ok), or null.", + { + "type": ["string", "null"], + "minLength": 3, + "maxLength": 150, + "pattern": r"^[A-Z0-9']{3,}$", + }, + ), + ( + "quencher", + "Return quencher (uppercase, alnum, apostrophe ok), or null.", + { + "type": ["string", "null"], + "minLength": 3, + "maxLength": 150, + "pattern": r"^[A-Z0-9']{3,}$", + }, + ), + ( + "modifications", + "Return array of modifications with 5'→3' positions; [] if none.", + { + "type": "array", + "minItems": 0, + "maxItems": 150, + "items": { + "type": "object", + "additionalProperties": False, + "required": [ + "modification_position", + "modification_type", + "modification_description", + ], + "properties": { + "modification_position": {"type": "integer", "minimum": 1}, + "modification_type": { + "type": "string", + "minLength": 1, + "maxLength": 100, + }, + "modification_description": { + "type": "string", + "minLength": 1, + "maxLength": 150, + }, + }, + }, + }, + ), + ( + "target_raw", + "Describe the intended target for this probe (gene/region/context).", + {"type": "string", "minLength": 5, "maxLength": 250}, + ), + ( + "target_normalized", + "If article prints the exact target sequence, return it in 5'…3' bounds; else null.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + # "pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + ), + ( + "primers", + "Return primer sequences in IUPAC normalized 5'…3' bounds; use null for missing.", + { + "type": "object", + "additionalProperties": False, + "required": ["forward", "reverse"], + "properties": { + 
"forward": { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + # "pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + "reverse": { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + # "pattern": r"^5'-([A-Za-z0-9()_'\-]*-)?([A-Za-z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[A-Za-z0-9()']*?)(-[A-Za-z0-9()_'\-]*)?-3'$", + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + }, + }, + ), + ("pH", "Return pH if stated; else null.", {"type": ["number", "null"]}), + ( + "annealing_raw", + "Return the raw annealing description string found; if absent, explain why in one sentence.", + {"type": "string", "minLength": 10, "maxLength": 250}, + ), + ( + "T", + "Return melting temperature as {value, unit} (e.g., 58 °C), or null.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Tris", + "Return Tris as {value, unit}, or null.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Na", + "Return Na as {value, unit}, or null.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "K", + "Return K as {value, unit}, or null.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + 
"value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Mg", + "Return Mg as {value, unit}, or null.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "DMSO", + "Return DMSO as {value, unit}, or null.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "outcome", + "Return true if article explicitly says this probe successfully hybridized, false if explicitly failed, or null if not stated.", + {"type": ["boolean", "null"]}, + ), + ] + elif chat_prompts == "my": + questions_to_schema = [ + ( + "is_seq", + "Check the whole article text. Is your picked sequence really a probe sequence or a part of probe sequence in this article text? Put true here if and only if this sequence is being described and presented as a hybridization probe. If that's a random abbreviation or nucleotide-looking string which is not a hybridization probe or otherwise not a hybridization probe, put false here.", + {"type": "boolean"}, + ), + ( + "sequence_full", + "Provide this sequence fully as a probe sequence in IUPAC-normalized format: from 5' to 3' end, with fluorophore and quencher. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + }, + ), + ( + "sequence_normalized", + "Provide this probe sequence in IUPAC-normalized format: from 5' to 3' end, with fluorophore and quencher. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. 
Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + ), + ( + "sequence_expanded", + "Provide this probe sequence in expanded IUPAC format (with all repeats expanded and no parentheses in the probe sequence backbone body): from 5' to 3' end, with fluorophore and quencher. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([a-zA-Z0-9_'-]*-)?([a-zA-Z0-9']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9']*?)(-[a-zA-Z0-9_'-]*)?-3'$", + }, + ), + ( + "sequence_backbone", + "Now provide only the probe sequence body from 5' to 3', without any fluorophores, modifications and quenchers. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([ACGUTRYSWKMBDHVN0-9()]{5,})-3'$", + }, + ), + ( + "sequence_backbone_expanded", + "Now provide only the expanded probe sequence body from 5' to 3' with all repeats expanded, without any fluorophores, modifications and quenchers. Use capital Latin letters, digits, dashes and apostrophy. Only the expanded backbone of probe sequence body. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([ACGUTRYSWKMBDHVN0-9]{5,})-3'$", + }, + ), + ( + "fluorophore", + "Provide the fluorophore of this probe. Use capital Latin letters, digits and dashes, you may also use an apostrophy. 
Put null here if not applicable or not present in the text of the article.", + { + "type": ["string", "null"], + "minLength": 3, + "maxLength": 150, + "pattern": r"^[A-Z0-9']{3,}$", + }, + ), + ( + "quencher", + "Provide the quencher of this probe. Use capital Latin letters, digits and dashes, you may also use an apostrophy. Put null here if not applicable or not present in the text of the article.", + { + "type": ["string", "null"], + "minLength": 3, + "maxLength": 150, + "pattern": r"^[A-Z0-9']{3,}$", + }, + ), + ( + "modifications", + "Now provide the modifications of the probe sequence as an array, where each element is a modification and its position in 5'-3' direction. Use Latin letters, digits and dashes, you may also use parentheses and apostrophy. Provide an empty array if not present in the article text.", + { + "type": "array", + "minItems": 0, + "maxItems": 150, + "items": { + "type": "object", + "additionalProperties": False, + "required": [ + "modification_position", + "modification_type", + "modification_description", + ], + "properties": { + "modification_position": { + "type": "integer", + "minimum": 1, + }, + "modification_type": { + "type": "string", + "maxLength": 100, + "minLength": 1, + }, + "modification_description": { + "type": "string", + "minLength": 1, + "maxLength": 150, + }, + }, + }, + }, + ), + ( + "target_raw", + "Describe the target to which this probe was designed to hybridize.", + {"type": "string", "minLength": 5, "maxLength": 250}, + ), + ( + "target_normalized", + "Now provide the target sequence to which this probe should hybridize, from 5' to 3'. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. 
Put null here if not applicable or if the exact sequence is not present in the article text.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + ), + ( + "primers", + "Describe the primer sequences in IUPAC-normalized format, each from 5' to 3' end. Use capital Latin letters, digits and dashes, parentheses and apostrophy. Put null to the primer if it is not present in the article text.", + { + "type": "object", + "additionalProperties": False, + "required": ["forward", "reverse"], + "properties": { + "forward": { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + "reverse": { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + }, + }, + ), + ( + "pH", + "Describe the pH in this experiment. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + {"type": ["number", "null"]}, + ), + ( + "annealing_raw", + "Describe the annealing in this experiment. Provide the raw description string. If that's can't be inferred from the whole article text, explain why.", + {"type": ["string"], "minLength": 10, "maxLength": 250}, + ), + ( + "T", + "Describe the melting temperature in this experiment and provide the measurement unit. 
Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Tris", + "Describe the amount of Tris in this experiment and provide the measurement unit. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Na", + "Describe the amount of Na (Sodium) in this experiment and provide the measurement unit. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "K", + "Describe the amount of K (Potassium) in this experiment and provide the measurement unit. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Mg", + "Describe the amount of Mg (Magnesium) in this experiment and provide the measurement unit. 
Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "DMSO", + "Describe the amount of DMSO in this experiment and provide the measurement unit. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "outcome", + "Describe the outcome of this hybridization experiment based on the article text. Put true in case of successful hybridization of this probe to target, put false in case of unsuccessful and put null if this information is not present in the article.", + {"type": ["boolean", "null"]}, + ), + ] + else: + raise ValueError("Chat prompts must either be 'my' or 'optimized'") + + answers_log: List[Dict[str, Any]] = [] + described_sequences: List[Tuple[str, Dict[str, Any]]] = [] + + # PERF tracking for the entire sequence-descriptor pass: + perf_start_dt = datetime.now(timezone.utc) + agg_prompt_tokens = 0 + agg_completion_tokens = 0 + have_token_info = False + + try: + for seq in tqdm( + sequences, + desc=f"Found sequences in {article_stem}", + position=tqdm_position, + leave=False, + ): + # Slice a small, relevant article window for this sequence + snippet = extract_relevant_snippet(article_text, seq, window=1400) + + # Build a short system prompt (article is only injected ONCE here) + sys_prompt = ( + prompt + + "\n\nYou will answer a series of short JSON-only questions about a SINGLE candidate probe sequence.\n" + + "You MUST base answers ONLY on this article 
text:\n
\n" + + article_text + + "\n
\n" + + f"And the most relevant snippet seems to be \nsnippet\n\n\n" + + "The candidate for being a probe sequence is:\n\n" + + seq + + "\n\nAnd you must bow work with only this sequence and all relevant context for it. You will be asked a series of questions about this sequence.\n" + + "Return strictly JSON for each question — no extra commentary. You will receive a JSON schema in each question." + ) + + # Create a fresh stateful session for THIS sequence (keeps context across questions) + chat = OllamaJSONChat( + client=client, + model_name=model_name, + system_prompt=sys_prompt, + options=ollama_parameters, + keep_alive="2m", + logger=logger, + use_schema_format=True, # will auto-downgrade if not supported + ) + + seq_desc: Dict[str, Any] = {} + for param, query, schema in tqdm( + questions_to_schema, + desc=f"Questions for {seq[:24]}…", + position=tqdm_position + 1, + leave=False, + ): + try: + user_msg = ( + query + + "\nReturn ONLY valid JSON matching this schema:\n" + + json.dumps(schema, ensure_ascii=False) + ) + q_raw_json = chat.ask_json(user_msg, schema=schema) + # Best-effort repair + parse + fixed = repair_json(q_raw_json) + obj = json.loads(fixed) + + # PERF gather from last_meta + meta = getattr(chat, "_last_meta", {}) or {} + pt = meta.get("prompt_eval_count") + ct = meta.get("eval_count") + if pt is not None or ct is not None: + have_token_info = True + if pt is not None: + agg_prompt_tokens += pt + if ct is not None: + agg_completion_tokens += ct + + # Persist logs + with open(raw_txt_path, mode="at", encoding="utf-8") as f: + f.write(f"> {query}\n< {q_raw_json}\n\n") + + validator = Draft202012Validator(schema) + errors = sorted(validator.iter_errors(obj), key=lambda er: er.path) + if errors: + try: + expected_type = schema.get("type") + if expected_type is not None and (expected_type != "object"): + probable_value = str(obj.get("value", obj.get("type"))) + try: + keys = obj.keys() + probable_value = obj[keys[0]] + except: + pass + if 
probable_value is not None: + validator_easy = Draft202012Validator(schema) + errors_easy = sorted( + validator_easy.iter_errors(probable_value), + key=lambda er: er.path, + ) + if not errors_easy: + with open( + raw_txt_path, + mode="at", + encoding="utf-8", + ) as f: + f.write( + f"> FIX_EASY\n< {probable_value}\n\n" + ) + answers_log.append( + { + "sequence": seq, + "param": param, + "response": obj, + "fixed_response": probable_value, + } + ) + seq_desc[param] = probable_value + continue + except Exception as e: + logger.exception("Failed to easily-fix an object") + # Fallback path using outlines for schema repair + fix_chat = outlines.inputs.Chat() + fix_chat.add_system_message( + prompt + + f"\nIn this chat you have to transform the user-provided JSON object to match the following schema:\n```json\n{json.dumps(schema)}\n```\n. If user provided-data is not enough to fill-in some fields, put null value in them, but try harder to transform as much data to the new schema as possible. Please do not modify or invent values by yourself. Just move existing values to the corresponging fields of the schema. Please be thoughtful and careful while doing so!" 
+ ) + fix_chat.add_user_message(q_raw_json) + try: + format_fixed_raw_json = think_generate( + model=model, + model_input=fix_chat, + logger=logger, + output_type=JsonSchema(schema=schema), + think=True, + options=ollama_parameters, + ) + except ollama.ResponseError: + logger.exception( + f"Error on model {model.model_name}, sequence {seq}, query {query} and prompts {chat_prompts}" + ) + print("", flush=True) + format_fixed_raw_json = q_raw_json + + # Persist logs + with open(raw_txt_path, mode="at", encoding="utf-8") as f: + f.write( + f"> FIX_PROMPT\n< {format_fixed_raw_json}\n\n" + ) + + format_fixed = repair_json(format_fixed_raw_json) + fixed_obj = json.loads(format_fixed) + + answers_log.append( + { + "sequence": seq, + "param": param, + "response": obj, + "fixed_response": fixed_obj, + } + ) + seq_desc[param] = fixed_obj + else: + answers_log.append( + {"sequence": seq, "param": param, "response": obj} + ) + seq_desc[param] = obj + except Exception as e: + logger.exception( + f"Exception on sequence {seq} during question '{param}'" + ) + with open(err_log_path, mode="at", encoding="utf-8") as ef: + ef.write(f"[{seq}] {param} error: {repr(e)}\n") + + described_sequences.append((seq, seq_desc)) + + finally: + # Write log + output JSONs (original behavior) + json_log_path.write_text( + json.dumps(answers_log, indent=2, ensure_ascii=False), encoding="utf-8" + ) + json_out_path.write_text( + json.dumps( + {s: d for (s, d) in described_sequences}, indent=2, ensure_ascii=False + ), + encoding="utf-8", + ) + + # PERF sidecar + DB insert for this pass. 
+ perf_end_dt = datetime.now(timezone.utc) + # token stats aggregated over Q&A: + prompt_tokens = agg_prompt_tokens if have_token_info else None + completion_tokens = agg_completion_tokens if have_token_info else None + + # pick user-facing pass label + pass_label = ( + "SeqDesc-OPTIM" if chat_prompts == "optimized" else "SeqDesc-MY" + ) + + if article_name is None: + article_name = article_stem + + try: + _write_perf_sidecar_and_db( + artifact_path=json_out_path, + pass_name=pass_label, + model_name=model_name, + article_name=article_name, + start_time=perf_start_dt, + end_time=perf_end_dt, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + db_path=db_path, + logger=logger, + notes=f"{pass_label} per-article descriptor map", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {json_out_path}: {repr(e)}" + ) + + try: + _write_perf_sidecar_and_db( + artifact_path=json_log_path, + pass_name=pass_label, + model_name=model_name, + article_name=article_name, + start_time=perf_start_dt, + end_time=perf_end_dt, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + db_path=db_path, + logger=logger, + notes=f"{pass_label} Q&A log", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {json_log_path}: {repr(e)}" + ) + + return described_sequences + + +# ────────────────────────────────────────────────────────────────────── +# Outlines runner +# ────────────────────────────────────────────────────────────────────── + + +def _now_stamp() -> str: + return datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f") + + +def think_generate( + model: outlines.models.ollama.Ollama, + model_input: outlines.inputs.Chat | str | list, + logger: logging.Logger, + output_type: Optional[Any] = None, + think: bool = True, + **kwargs: Any, +) -> str: + if think: + try: + logger.debug("Trying thinking mode") + response = model.generate( + model_input=model_input, output_type=output_type, think=True, 
**kwargs + ) + return response + except ollama.ResponseError: + logger.warning( + f"Seems that model {model.model_name} does not support thinking." + ) + + logger.debug("Trying non-thinking mode") + response = model.generate( + model_input=model_input, output_type=output_type, think=False, **kwargs + ) + + return response + + +def run_single_pass( + model: Any, + article_text: str, + pass_cfg: PassConfig, + out_base: Path, + article_stem: str, + tools: List[Any], + logger: logging.Logger, + ollama_parameters: Dict[str, Any], + model_name: str, + db_path: Optional[Path] = None, # NEW: for perf sidecar + DB + article_name: Optional[str] = None, # NEW: for perf sidecar + DB +) -> Dict[str, Any]: + """Run one pass (schema+prompt from files), save raw+json+log, return object. + + NEW: + - We track timing for this single pass. + - We emit .perf.json containing timing & token stats. + - We mirror those metrics to SQLite via insert_pipeline_artifact(). + """ + perf_start_dt = datetime.now(timezone.utc) + + txt_dir = out_base / "txt" + json_dir = out_base / "json" + log_dir = out_base / "logs" + for d in (txt_dir, json_dir, log_dir): + d.mkdir(parents=True, exist_ok=True) + + js = JsonSchema(pass_cfg.schema_path.read_text(encoding="utf-8")) + validator = Draft202012Validator(json.loads(js.schema)) + prompt = pass_cfg.prompt_path.read_text(encoding="utf-8") + + stamp = _now_stamp() + raw_txt_path = ( + txt_dir + / f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.txt" + ) + json_out_path = ( + json_dir + / f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.json" + ) + err_log_path = ( + log_dir + / f"{article_stem}__{pass_cfg.name}__{model_name_encode(model_name)}__{stamp}.log" + ) + + logger.info(f"[{pass_cfg.name}:{model_name}] generating …") + response = "" + try: + response = think_generate( + model=model, + model_input=prompt + + "\n" + + "And here is the article text you must base your answer on:\n\n
\n" + + article_text + + "\n<\\article>\n", + output_type=js, + options=ollama_parameters, + logger=logger, + keep_alive="30s", + ) + except Exception as e: + logger.exception(f"[{pass_cfg.name}:{model_name}] stream error") + err_log_path.write_text(f"STREAM ERROR:\n{e}\n", encoding="utf-8") + raise + + raw_txt_path.write_text(response, encoding="utf-8") + + try: + fixed = repair_json(response) + obj = json.loads(fixed) + except Exception as e: + logger.exception(f"[{pass_cfg.name}:{model_name}] JSON parse error") + err_log_path.write_text( + f"JSON ERROR:\n{e}\nRAW:\n{response}\n", encoding="utf-8" + ) + raise + + errors = sorted(validator.iter_errors(obj), key=lambda er: er.path) + if errors: + msg = "\n".join(str(e) for e in errors) + logger.error(f"[{pass_cfg.name}:{model_name}] validation errors:\n{msg}") + err_log_path.write_text( + f"VALIDATION ERRORS:\n{msg}\nJSON:\n{json.dumps(obj, indent=2)}", + encoding="utf-8", + ) + else: + logger.info(f"[{pass_cfg.name}:{model_name}] validation OK") + logger.info(f"[{pass_cfg.name}] validation OK [{model_name}]") + + json_out_path.write_text( + json.dumps(obj, indent=2, ensure_ascii=False), encoding="utf-8" + ) + + # PERF sidecar + DB insert + perf_end_dt = datetime.now(timezone.utc) + if article_name is None: + article_name = article_stem + try: + _write_perf_sidecar_and_db( + artifact_path=json_out_path, + pass_name=pass_cfg.name, + model_name=model_name, + article_name=article_name, + start_time=perf_start_dt, + end_time=perf_end_dt, + prompt_tokens=None, # Outlines doesn't expose token counts directly + completion_tokens=None, + db_path=db_path, + logger=logger, + notes=f"{pass_cfg.name} single-pass extraction", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {json_out_path}: {repr(e)}" + ) + + return obj + + +def run_construct_single_experiment_pass( + model: Any, + article_text: str, + sequence: str, + sequence_id: int, + pass_cfg: PassConfig, + out_base: Path, + 
article_stem: str, + tools: List[Any], + logger: logging.Logger, + ollama_parameters: Dict[str, Any], + model_name: str, + db_path: Optional[Path] = None, # NEW: for perf sidecar + DB + article_name: Optional[str] = None, # NEW: for perf sidecar + DB +) -> Dict[str, Any]: + """Run one pass (schema+prompt from files), save raw+json+log, return object. + + NEW: + - Perf timing + sidecar + DB artifact row (per sequence_id). + """ + perf_start_dt = datetime.now(timezone.utc) + + txt_dir = out_base / "txt" + json_dir = out_base / "json" + log_dir = out_base / "logs" + for d in (txt_dir, json_dir, log_dir): + d.mkdir(parents=True, exist_ok=True) + + js = JsonSchema(pass_cfg.schema_path.read_text(encoding="utf-8")) + validator = Draft202012Validator(json.loads(js.schema)) + prompt = pass_cfg.prompt_path.read_text(encoding="utf-8") + + stamp = _now_stamp() + raw_txt_path = ( + txt_dir + / f"{article_stem}__{pass_cfg.name}__{sequence_id}__{model_name_encode(model_name)}__{stamp}.txt" + ) + json_out_path = ( + json_dir + / f"{article_stem}__{pass_cfg.name}__{sequence_id}__{model_name_encode(model_name)}__{stamp}.json" + ) + err_log_path = ( + log_dir + / f"{article_stem}__{pass_cfg.name}__{sequence_id}__{model_name_encode(model_name)}__{stamp}.log" + ) + + logger.info(f"[{pass_cfg.name}:{model_name}] generating …") + response = "" + try: + response = think_generate( + model=model, + model_input=outlines.inputs.Chat( + [ + { + "role": "system", + "content": prompt + + "\n" + + "And here is the article text you must base your answer on:\n\n
\n" + + article_text + + "\n<\\article>\n", + }, + { + "role": "user", + "content": "Let's describe a single nucleotide sequence!", + }, + { + "role": "assistant", + "content": "Sure! Let's describe one! But before we start, could you please tell me in which format you would like me to provide you an answer?", + }, + { + "role": "user", + "content": "Great question! I would like your answer to satisfy the following JSON schema:\n```json" + + js.schema + + "\n```\n\nIs it OK?", + }, + { + "role": "assistant", + "content": "Absolutely! Now please provide the nucleotide sequence you want me to describe in terms of tthe hybridization experiment design and I will provide you its description strictly following your provided JSON schema!", + }, + { + "role": "user", + "content": sequence, + }, + ] + ), + logger=logger, + output_type=js, + options=ollama_parameters, + keep_alive="30s", + ) + + except Exception as e: + logger.exception(f"[{pass_cfg.name}:{model_name}] stream error") + err_log_path.write_text(f"STREAM ERROR:\n{e}\n", encoding="utf-8") + raise + + raw_txt_path.write_text(response, encoding="utf-8") + + try: + fixed = repair_json(response) + obj = json.loads(fixed) + except Exception as e: + logger.exception(f"[{pass_cfg.name}:{model_name}] JSON parse error") + err_log_path.write_text( + f"JSON ERROR:\n{e}\nRAW:\n{response}\n", encoding="utf-8" + ) + raise + + errors = sorted(validator.iter_errors(obj), key=lambda er: er.path) + if errors: + msg = "\n".join(str(e) for e in errors) + logger.error(f"[{pass_cfg.name}:{model_name}] validation errors:\n{msg}") + err_log_path.write_text( + f"VALIDATION ERRORS:\n{msg}\nJSON:\n{json.dumps(obj, indent=2)}", + encoding="utf-8", + ) + else: + logger.info(f"[{pass_cfg.name}:{model_name}] validation OK") + logger.info(f"[{pass_cfg.name}] validation OK [{model_name}]") + + json_out_path.write_text( + json.dumps(obj, indent=2, ensure_ascii=False), encoding="utf-8" + ) + + # PERF sidecar + DB insert + perf_end_dt = 
datetime.now(timezone.utc) + if article_name is None: + article_name = article_stem + try: + _write_perf_sidecar_and_db( + artifact_path=json_out_path, + pass_name=f"{pass_cfg.name}__{sequence_id}", + model_name=model_name, + article_name=article_name, + start_time=perf_start_dt, + end_time=perf_end_dt, + prompt_tokens=None, + completion_tokens=None, + db_path=db_path, + logger=logger, + notes=f"{pass_cfg.name} construct_single_experiment_pass for sequence {sequence_id}", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {json_out_path}: {repr(e)}" + ) + + return obj + + +def run_query_model( + model: Any, + article_text: str, + sequences: List[str], + out_base: Path, + article_stem: str, + common_prompt_path: Path, + logger: logging.Logger, + ollama_parameters: Dict[str, Any], + model_name: str, + tqdm_position: int = 0, +) -> List[Tuple[str, Any]]: + """Run one pass (schema+prompt from files), save raw+json+log, return object.""" + pass_name = "query_chat" + txt_dir = out_base / "txt" + json_dir = out_base / "json" + log_dir = out_base / "logs" + for d in (txt_dir, json_dir, log_dir): + d.mkdir(parents=True, exist_ok=True) + + prompt = common_prompt_path.read_text(encoding="utf-8") + + stamp = _now_stamp() + raw_txt_path = ( + txt_dir + / f"{article_stem}__{pass_name}__{model_name_encode(model_name)}__{stamp}.txt" + ) + json_log_path = ( + json_dir + / f"{article_stem}__{pass_name}__{model_name_encode(model_name)}__{stamp}.log.json" + ) + json_out_path = ( + json_dir + / f"{article_stem}__{pass_name}__{model_name_encode(model_name)}__{stamp}.json" + ) + err_log_path = ( + log_dir + / f"{article_stem}__{pass_name}__{model_name_encode(model_name)}__{stamp}.log" + ) + + logger.info(f"[{pass_name}:{model_name}] generating …") + + def ask_with_schema(chat_messages: outlines.inputs.Chat, schema: JsonSchema): + response = "" + try: + response = think_generate( + model=model, + model_input=chat_messages, + output_type=schema, + 
options=ollama_parameters, + logger=logger, + keep_alive="30s", + ) + except Exception as e: + logger.exception(f"[{pass_name}:{model_name}] stream error") + err_log_path.write_text(f"STREAM ERROR:\n{e}\n", encoding="utf-8") + raise + + with open(raw_txt_path, mode="at", encoding="utf-8") as f: + f.write(f"> {chat_messages.messages[-1]}\n< ") + f.write(response) + f.write("\n\n") + + try: + fixed = repair_json(response) + obj = json.loads(fixed) + except Exception as e: + logger.exception(f"[{pass_name}:{model_name}] JSON parse error") + err_log_path.write_text( + f"JSON ERROR:\n{e}\nRAW:\n{response}\n", encoding="utf-8" + ) + raise + + return obj, response + + base_chat = outlines.inputs.Chat( + [ + { + "role": "system", + "content": prompt + + "\n" + + "And here is the article text you must base your answer on:\n\n
\n" + + article_text + + "\n<\\article>\n", + } + ] + ) + answers = [] + + try: + + def parse_sequence(seq: str, base_chat: outlines.inputs.Chat): + chat = outlines.inputs.Chat(base_chat.messages) + questions_to_schema: List[Tuple[str, str, Dict[str, Any]]] = [ + ( + "is_seq", + "Check the whole article text. Is your picked sequence really a probe sequence or a part of probe sequence in this article text? Put true here if and only if this sequence is being described and presented as a hybridization probe. If that's a random abbreviation or nucleotide-looking string which is not a hybridization probe or otherwise not a hybridization probe, put false here.", + {"type": "boolean"}, + ), + ( + "sequence_full", + "Provide this sequence fully as a probe sequence in IUPAC-normalized format: from 5' to 3' end, with fluorophore and quencher. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + }, + ), + ( + "sequence_normalized", + "Provide this probe sequence in IUPAC-normalized format: from 5' to 3' end, with fluorophore and quencher. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + ), + ( + "sequence_expanded", + "Provide this probe sequence in expanded IUPAC format (with all repeats expanded and no parentheses in the probe sequence backbone body): from 5' to 3' end, with fluorophore and quencher. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. 
Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([a-zA-Z0-9_'-]*-)?([a-zA-Z0-9']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9']*?)(-[a-zA-Z0-9_'-]*)?-3'$", + }, + ), + ( + "sequence_backbone", + "Now provide only the probe sequence body from 5' to 3', without any fluorophores, modifications and quenchers. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([ACGUTRYSWKMBDHVN0-9()]{5,})-3'$", + }, + ), + ( + "sequence_backbone_expanded", + "Now provide only the expanded probe sequence body from 5' to 3' with all repeats expanded, without any fluorophores, modifications and quenchers. Use capital Latin letters, digits, dashes and apostrophy. Only the expanded backbone of probe sequence body. Put null here if not applicable.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([ACGUTRYSWKMBDHVN0-9]{5,})-3'$", + }, + ), + ( + "fluorophore", + "Provide the fluorophore of this probe. Use capital Latin letters, digits and dashes, you may also use an apostrophy. Put null here if not applicable or not present in the text of the article.", + { + "type": ["string", "null"], + "minLength": 3, + "maxLength": 150, + "pattern": r"^[A-Z0-9']{3,}$", + }, + ), + ( + "quencher", + "Provide the quencher of this probe. Use capital Latin letters, digits and dashes, you may also use an apostrophy. Put null here if not applicable or not present in the text of the article.", + { + "type": ["string", "null"], + "minLength": 3, + "maxLength": 150, + "pattern": r"^[A-Z0-9']{3,}$", + }, + ), + ( + "modifications", + "Now provide the modifications of the probe sequence as an array, where each element is a modification and its position in 5'-3' direction. 
Use Latin letters, digits and dashes, you may also use parentheses and apostrophy. Provide an empty array if not present in the article text.", + { + "type": "array", + "minItems": 0, + "maxItems": 150, + "items": { + "type": "object", + "additionalProperties": False, + "required": [ + "modification_position", + "modification_type", + "modification_description", + ], + "properties": { + "modification_position": { + "type": "integer", + "minimum": 1, + }, + "modification_type": { + "type": "string", + "maxLength": 100, + "minLength": 1, + }, + "modification_description": { + "type": "string", + "minLength": 1, + "maxLength": 150, + }, + }, + }, + }, + ), + ( + "target_raw", + "Describe the target to which this probe was designed to hybridize.", + {"type": "string", "minLength": 5, "maxLength": 250}, + ), + ( + "target_normalized", + "Now provide the target sequence to which this probe should hybridize, from 5' to 3'. Use capital Latin letters, digits and dashes, you may also use parentheses and apostrophy. Put null here if not applicable or if the exact sequence is not present in the article text.", + { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + ), + ( + "primers", + "Describe the primer sequences in IUPAC-normalized format, each from 5' to 3' end. Use capital Latin letters, digits and dashes, parentheses and apostrophy. 
Put null to the primer if it is not present in the article text.", + { + "type": "object", + "additionalProperties": False, + "required": ["forward", "reverse"], + "properties": { + "forward": { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + "reverse": { + "type": ["string", "null"], + "minLength": 5, + "maxLength": 150, + "pattern": r"^5'-([a-zA-Z0-9(_)'-]*-)?([a-zA-Z0-9()']*?[ACGUTRYSWKMBDHVN]{5,}[a-zA-Z0-9()']*?)(-[a-zA-Z0-9(_)'-]*)?-3'$", + }, + }, + }, + ), + ( + "pH", + "Describe the pH in this experiment. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + {"type": ["number", "null"]}, + ), + ( + "annealing_raw", + "Describe the annealing in this experiment. Provide the raw description string. If that's can't be inferred from the whole article text, explain why.", + {"type": ["string"], "minLength": 10, "maxLength": 250}, + ), + ( + "T", + "Describe the melting temperature in this experiment and provide the measurement unit. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Tris", + "Describe the amount of Tris in this experiment and provide the measurement unit. 
Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Na", + "Describe the amount of Na (Sodium) in this experiment and provide the measurement unit. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "K", + "Describe the amount of K (Potassium) in this experiment and provide the measurement unit. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "Mg", + "Describe the amount of Mg (Magnesium) in this experiment and provide the measurement unit. Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "DMSO", + "Describe the amount of DMSO in this experiment and provide the measurement unit. 
Only put null here if this information is not present in the article text and can't be inferred from the whole article text.", + { + "type": ["object", "null"], + "additionalProperties": False, + "required": ["value", "unit"], + "properties": { + "value": {"type": "number"}, + "unit": {"type": "string", "minLength": 1, "maxLength": 10}, + }, + }, + ), + ( + "outcome", + "Describe the outcome of this hybridization experiment based on the article text. Put true in case of successful hybridization of this probe to target, put false in case of unsuccessful and put null if this information is not present in the article.", + {"type": ["boolean", "null"]}, + ), + ] + + seq_desc: Dict[str, Any] = dict() + + for param, query, schema in tqdm( + questions_to_schema, + desc=f"Questions to the sequence {seq} in {article_stem}", + position=tqdm_position + 1, + leave=False, + ): + try: + chat.add_user_message( + query + + "\nAnd here is the schema yout answer has to follow:\n```json\n" + + json.dumps(schema) + + "```\n" + ) + response, raw = ask_with_schema( + chat_messages=chat, schema=JsonSchema(schema) + ) + answers.append({"seq": seq, "param": param, "response": response}) + seq_desc[param] = response + chat.add_assistant_message(raw) + except Exception as e: + logger.exception( + f"Exception on sequence {seq} during query: {query}", e + ) + return seq_desc + + described_sequences: List[Tuple[str, Dict[str, Any]]] = [] + for seq in tqdm( + sequences, + desc=f"Found sequences in {article_stem}", + position=tqdm_position, + leave=False, + ): + base_chat_with_sequence = outlines.inputs.Chat(base_chat.messages) + base_chat_with_sequence.add_user_message( + "Let's pick and analyze a single probe sequence from the article text. Provide the probe sequence which we will describe in all the following messages." + ) + base_chat_with_sequence.add_assistant_message(seq) + base_chat_with_sequence.add_user_message( + f"Great choice! 
Let's analyze nucleotidic sequence {seq} for the rest of this chat!" + ) + try: + sequence_descriptor = parse_sequence( + seq, base_chat=base_chat_with_sequence + ) + described_sequences.append((seq, sequence_descriptor)) + answers.append( + {"sequence": seq, "sequence_descriptor": sequence_descriptor} + ) + except Exception as e: + logger.exception( + f'[{pass_name}:{model_name}] Sequence "{seq}" error computing descriptor' + ) + err_log_path.write_text( + f'[{pass_name}:{model_name}] Sequence "{seq}" error computing descriptor', + encoding="utf-8", + ) + finally: + json_log_path.write_text( + json.dumps(answers, indent=2, ensure_ascii=False), encoding="utf-8" + ) + json_out_path.write_text( + json.dumps(described_sequences, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + return described_sequences + + +# ────────────────────────────────────────────────────────────────────── +# Stitcher (to your full object) +# ────────────────────────────────────────────────────────────────────── + + +def _merge_reports(*reports: Optional[Dict[str, Any]]) -> Dict[str, Any]: + out = {"missing": [], "uncertain": [], "notes": None} + notes = [] + for r in reports: + if not r: + continue + out["missing"].extend(r.get("missing") or []) + out["uncertain"].extend(r.get("uncertain") or []) + if r.get("notes"): + notes.append(str(r["notes"])) + out["missing"] = list(dict.fromkeys(out["missing"])) + out["uncertain"] = list(dict.fromkeys(out["uncertain"])) + out["notes"] = " | ".join(notes) if notes else None + return out + + +def _to_si( + value: Optional[float], unit: Optional[str] +) -> Tuple[Optional[float], Optional[str]]: + return to_si(value, unit) + + +def _to_measurement_full(m_lite: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]: + if not m_lite: + return None + raw = m_lite.get("raw") or "" + value = m_lite.get("value") + unit = m_lite.get("unit") + si_value, si_unit = ( + _to_si(value, unit) + if (value is not None and unit is not None) + else (None, None) + ) + 
return { + "raw": raw, + "value": value, + "unit": unit, + "si_value": si_value, + "si_unit": si_unit, + "assumptions": None, + "provenance": { + "source_type": "unknown", + "page": None, + "section": None, + "quote": None, + "notes": None, + }, + } + + +def _detect_sa_from_name(name: Optional[str]) -> Optional[str]: + if not name: + return None + n = name.strip().lower() + if n.endswith(")as"): + return "antisense" + if n.endswith(")s"): + return "sense" + return None + + +def _coerce_sa(value: Optional[str], name: Optional[str]) -> Optional[str]: + m = { + "s": "sense", + "as": "antisense", + "sense": "sense", + "antisense": "antisense", + "+": "sense", + "-": "antisense", + } + if value is None or (isinstance(value, str) and not value.strip()): + return _detect_sa_from_name(name) + return m.get(str(value).strip().lower(), _detect_sa_from_name(name)) + + +def _to_oligo_full(ol: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]: + if not ol: + return None + return { + "raw": ol.get("raw") or "", + "sequence": ol.get("sequence"), + "length_bases": ol.get("length_bases"), + "prime_prefix": None, + "five_prime_label": ol.get("five_prime_label"), + "three_prime_label": ol.get("three_prime_label"), + "labels": [], + "sense_antisense": ol.get("sense_antisense"), + "provenance": { + "source_type": "unknown", + "page": None, + "section": None, + "quote": None, + "notes": None, + }, + } + + +def stitch_full( + A_core: Dict[str, Any], + B_index: Dict[str, Any], + C_sequences: Dict[str, Any], + D_parameters: Dict[str, Any], + E_outcomes: Dict[str, Any], + F_pairings: Dict[str, Any], +) -> Dict[str, Any]: + core = { + "doi": A_core.get("doi"), + "abstract": A_core.get("abstract"), + "topic": A_core.get("topic"), + } + E: Dict[str, Dict[str, Any]] = {} + for e in B_index.get("experiments") or []: + E[e["id_exp"]] = { + "id_exp": e["id_exp"], + "raw_description": e.get("raw_description"), + "type": e.get("type"), + "description": e.get("description"), + "metadata": {}, + 
"sequences": {}, + "experiment_properties": {}, + "outcome": {}, + "pairing": {}, + "extraction_report": {"missing": [], "uncertain": [], "notes": None}, + } + + for item in C_sequences.get("items") or []: + ie = item["id_exp"] + if ie not in E: + continue + prb = item.get("probe") or {} + seqs = {} + seqs["probe"] = { + "name": prb.get("name"), + "amplicon_id": prb.get("amplicon_id"), + "oligo": _to_oligo_full(prb.get("oligo")), + "fluorophore": prb.get("fluorophore"), + "quencher": prb.get("quencher"), + "sense_antisense": _coerce_sa(prb.get("sense_antisense"), prb.get("name")), + "notes": prb.get("notes"), + } + tgt = item.get("target_sequence") + seqs["target_sequence"] = _to_oligo_full(tgt) if tgt is not None else None + pr = item.get("primer_sequences") + if isinstance(pr, dict): + seqs["primer_sequences"] = { + "forward": _to_oligo_full(pr.get("forward")), + "reverse": _to_oligo_full(pr.get("reverse")), + } + else: + seqs["primer_sequences"] = None + rels = [] + for rs in item.get("related_sequences") or []: + rels.append( + { + "related_sequence": _to_oligo_full(rs.get("related_sequence")), + "description": rs.get("description"), + } + ) + seqs["related_sequences"] = rels + E[ie]["sequences"] = seqs + + for item in D_parameters.get("items") or []: + ie = item["id_exp"] + if ie not in E: + continue + MD: Dict[str, Any] = {} + _md = item.get("metadata") or {} + MD["organism"] = _md.get("organism") + MD["technology"] = _md.get("technology") + ann = _md.get("annealing") + if ann is None: + MD["annealing"] = None + elif isinstance(ann, dict): + MD["annealing"] = { + "quantitative": _to_measurement_full(ann.get("quantitative")), + "qualitative": ann.get("qualitative"), + } + else: + MD["annealing"] = None + MD["pH"] = _to_measurement_full(_md.get("pH")) + ri = _md.get("rna_impurities") + if ri is None: + MD["rna_impurities"] = None + elif isinstance(ri, dict): + MD["rna_impurities"] = { + "quantitative": _to_measurement_full(ri.get("quantitative")), + 
"qualitative": ri.get("qualitative"), + } + else: + MD["rna_impurities"] = None + + EP: Dict[str, Any] = {} + concs = (item.get("experiment_properties") or {}).get("concentrations") or {} + EP["concentrations"] = { + "dna_rna_concentration": _to_measurement_full( + concs.get("dna_rna_concentration") + ), + "concentration_SI": _to_measurement_full(concs.get("concentration_SI")), + } + pars = (item.get("experiment_properties") or {}).get("parameters_SI") or {} + EP["parameters_SI"] = { + "temperature": _to_measurement_full(pars.get("temperature")), + "Tris": _to_measurement_full(pars.get("Tris")), + "Na": _to_measurement_full(pars.get("Na")), + "K": _to_measurement_full(pars.get("K")), + "Mg": _to_measurement_full(pars.get("Mg")), + "DMSO": _to_measurement_full(pars.get("DMSO")), + } + E[ie]["metadata"] = MD + E[ie]["experiment_properties"] = EP + + for item in E_outcomes.get("items") or []: + ie = item["id_exp"] + if ie not in E: + continue + E[ie]["outcome"] = { + "outcome": item.get("outcome"), + "fluorescence": _to_measurement_full(item.get("fluorescence")), + "comparative_notes": item.get("comparative_notes"), + } + + for item in F_pairings.get("items") or []: + ie = item["id_exp"] + if ie not in E: + continue + E[ie]["pairing"] = { + "paired_with_probe_name": item.get("paired_with_probe_name"), + "relationship": item.get("relationship"), + } + + full_report = _merge_reports( + A_core.get("extraction_report"), + B_index.get("extraction_report"), + C_sequences.get("extraction_report"), + D_parameters.get("extraction_report"), + E_outcomes.get("extraction_report"), + F_pairings.get("extraction_report"), + ) + + return { + "doi": core["doi"], + "abstract": core["abstract"], + "topic": core["topic"], + "experiments": list(E.values()), + "extraction_report": full_report, + } + + +def _deep_merge_keep_left(a, b): + """Shallow-friendly deep merge: keep a's non-null scalars; use b if a is None. + - Dicts: recurse. + - Lists: concatenate (no dedup). 
+ - Scalars: prefer a unless a is None/empty, then b. + """ + if a is None: + return b + if b is None: + return a + if isinstance(a, dict) and isinstance(b, dict): + out = dict(a) + for k, bv in b.items(): + av = out.get(k) + out[k] = _deep_merge_keep_left(av, bv) if k in out else bv + return out + if isinstance(a, list) and isinstance(b, list): + return a + b + # prefer a unless it's falsy and b is truthy + return a if a not in (None, "", []) else b + + +def aggregate_c_outputs(outputs: Dict[str, Dict[str, Any]]) -> Dict[str, Any]: + """Build a consolidated C_sequences object from any of: C_sequences, C1_probe_core, C2_target_primers, C3_related.""" + # Start with single-pass C if present + base = outputs.get("C_sequences") or { + "items": [], + "extraction_report": {"missing": [], "uncertain": [], "notes": None}, + } + + # Build item index by id_exp from base + items_map: Dict[str, Dict[str, Any]] = {} + for it in base.get("items", []): + if not isinstance(it, dict) or "id_exp" not in it: + continue + items_map[it["id_exp"]] = dict(it) + + def _merge_from(pass_name: str, fields: List[str]): + obj = outputs.get(pass_name) + if not isinstance(obj, dict): + return + for it in obj.get("items", []): + if not isinstance(it, dict) or "id_exp" not in it: + continue + ie = it["id_exp"] + tgt = items_map.setdefault(ie, {"id_exp": ie}) + for f in fields: + if f in it: + tgt[f] = _deep_merge_keep_left(tgt.get(f), it[f]) + + # merge extraction report + br = base.get("extraction_report") or { + "missing": [], + "uncertain": [], + "notes": None, + } + er = obj.get("extraction_report") or { + "missing": [], + "uncertain": [], + "notes": None, + } + br["missing"] = list((br.get("missing") or []) + (er.get("missing") or [])) + br["uncertain"] = list( + (br.get("uncertain") or []) + (er.get("uncertain") or []) + ) + br_notes = [n for n in [br.get("notes"), er.get("notes")] if n] + br["notes"] = " | ".join(br_notes) if br_notes else None + base["extraction_report"] = br + + # Merge 
micro-passes over base (C1, C2, C3). The field names match your schemas. + _merge_from("C1_probe_core", ["probe"]) + _merge_from("C2_target_primers", ["target_sequence", "primer_sequences"]) + _merge_from("C3_related", ["related_sequences"]) + + # Produce consolidated list + merged_items = list(items_map.values()) + # Normalize: ensure all top-level keys exist for stitch_full + for it in merged_items: + it.setdefault("probe", None) + it.setdefault("target_sequence", None) + it.setdefault("primer_sequences", None) + it.setdefault("related_sequences", []) + + return { + "items": merged_items, + "extraction_report": base.get("extraction_report") + or {"missing": [], "uncertain": [], "notes": None}, + } + + +# ────────────────────────────────────────────────────────────────────── +# Project runner +# ────────────────────────────────────────────────────────────────────── + + +def run_project(project_dir: str | Path, fresh: bool = False) -> None: + """Run the pipeline as configured by files under project_dir. + + NEW: + - fresh=False (default): continuation mode. We skip any article/model pair + that already has a successful "FULL" pass recorded in the DB + (hyb_db.pipeline_artifacts.success == 1 for pass_name "FULL"). + - fresh=True: force re-run everything. + + NOTE: + - Within a single interrupted article we may redo that article from scratch. + This is allowed per spec. 
+ """ + project_dir = Path(project_dir) + cfg = load_pipeline_config(project_dir) + + out_base = cfg.out_dir + out_base.mkdir(parents=True, exist_ok=True) + logger = _make_logger(out_base / "logs") + + headers = dict() + if API_TOKEN is not None: + headers["Authorization"] = f"Bearer {API_TOKEN}" + + # Ollama client + Outlines model + client = ollama.Client( + host=cfg.ollama_base_url, timeout=cfg.timeout_s, headers=headers + ) + + ollama_models = client.list() + for model_name in tqdm(cfg.model_names, desc="LLM Models", position=0, leave=False): + model = outlines.from_ollama(client, model_name) + tools = [to_si, parse_oligo, make_measurement] + + # Optional full-schema validator + full_validator = None + if cfg.full_schema_path and cfg.full_schema_path.exists(): + try: + full_schema_text = cfg.full_schema_path.read_text(encoding="utf-8") + full_validator = Draft202012Validator(json.loads(full_schema_text)) + logger.info("Loaded full schema for final validation.") + except Exception: + logger.exception( + "Failed to load/parse full schema; proceeding without final validation." + ) + + logger.info(f"Article glob: {cfg.article_glob}") + + # Iterate input articles + files = sorted( + cfg.input_dir.glob(cfg.article_glob), key=lambda s: str(s).upper() + ) + logger.info(f"Files: {files}") + + for art_path in tqdm( + files, desc=f"Articles for model {model_name}", position=1, leave=False + ): + article_name = art_path.stem + + # CONTINUATION CHECK: + # If --fresh was not passed and db_path is configured, attempt to skip + # articles already fully processed for this model. + if (not fresh) and cfg.db_path: + try: + from hyb_db import get_completed_passes + + completed = get_completed_passes( + db_path=str(cfg.db_path), + model_name=model_name, + article_name=article_name, + ) + if "FULL" in completed: + logger.info( + f"[CONTINUE] Skipping {article_name} for model {model_name} (already FULL)." 
+ ) + continue + except Exception: + logger.exception( + f"[CONTINUE] Continuation check failed for {article_name}:{model_name}; proceeding with full run." + ) + + logger.info(f"=== {article_name} : {model_name} ===") + article_text = art_path.read_text(encoding="utf-8") + + # Run configured pre-passes + outputs: Dict[str, Dict[str, Any]] = {} + for p in tqdm( + cfg.pre_passes, + desc=f"{article_name} pre-passes", + position=2, + leave=False, + ): + try: + outputs[p.name] = run_single_pass( + model=model, + article_text=article_text, + pass_cfg=p, + out_base=out_base, + article_stem=article_name, + tools=tools, + logger=logger, + ollama_parameters=cfg.ollama_parameters, + model_name=model_name, + db_path=cfg.db_path, + article_name=article_name, + ) + except Exception: + logger.exception( + f"Pass failed: {p.name} : {article_name} : {model_name}" + ) + + strict_sequences: Set[str] = set( + map(lambda s: s.upper(), outputs.get("SeqPrompt_strict", [])) + ) + nonstrict_sequences: Set[str] = set( + map(lambda s: s.upper(), outputs.get("SeqPrompt", [])) + ) + + all_found_sequences = list( + sorted( + strict_sequences.union(nonstrict_sequences), + key=lambda s: (0 if s in strict_sequences else 1), + ) + ) + all_found_sequences_str = ", ".join(all_found_sequences) + logger.info("Pre-passes done, found sequences: " + all_found_sequences_str) + + for p in tqdm( + cfg.passes, desc=f"{article_name} passes", leave=False, position=2 + ): + try: + outputs[p.name] = run_single_pass( + model=model, + article_text=article_text, + pass_cfg=p, + out_base=out_base, + article_stem=article_name, + tools=tools, + logger=logger, + ollama_parameters=cfg.ollama_parameters, + model_name=model_name, + db_path=cfg.db_path, + article_name=article_name, + ) + except Exception: + logger.exception( + f"Pass failed: {p.name} : {article_name} : {model_name}" + ) + + optimized_sequence_descriptors = run_query_model_speed_up( + model=model, # not used in the fast version but kept for signature 
compatibility + article_text=article_text, + sequences=all_found_sequences, + out_base=out_base, + article_stem=article_name, + common_prompt_path=cfg.common_prompt_path, + ollama_parameters=cfg.ollama_parameters, + logger=logger, + model_name=model_name, + tqdm_position=2, + client=client, # <-- important: pass the raw ollama.Client + chat_prompts="optimized", + db_path=cfg.db_path, + article_name=article_name, + ) + + stamp = _now_stamp() + full_dir = out_base / "json_full" + full_dir.mkdir(parents=True, exist_ok=True) + full_seq_desc_path = ( + full_dir + / f"{article_name}_{model_name_encode(model_name)}__SeqDesc-OPTIM__{stamp}.json" + ) + # record perf for creating this full copy (aggregation-only, + # tokens not known here separately) + write_start_dt = datetime.now(timezone.utc) + full_seq_desc_path.write_text( + json.dumps( + optimized_sequence_descriptors, indent=2, ensure_ascii=False + ), + encoding="utf-8", + ) + write_end_dt = datetime.now(timezone.utc) + if cfg.db_path: + try: + _write_perf_sidecar_and_db( + artifact_path=full_seq_desc_path, + pass_name="SeqDesc-OPTIM", + model_name=model_name, + article_name=article_name, + start_time=write_start_dt, + end_time=write_end_dt, + prompt_tokens=None, + completion_tokens=None, + db_path=cfg.db_path, + logger=logger, + notes="SeqDesc-OPTIM aggregation file", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {full_seq_desc_path}: {repr(e)}" + ) + + try: + # Optional DB insert + if cfg.db_path: + try: + from hyb_db import insert_seqdesc_object # your earlier module + + run_id = insert_seqdesc_object( + db_path=str(cfg.db_path), + article_name=article_name, + doi=outputs.get("A_core", {}).get("doi", None), + model_name=model_name, + sequence_descriptors=optimized_sequence_descriptors, + source_path=art_path, + ) + logger.info( + f"[DB INSERT SEQDESC OPTIM] inserted run_id={run_id} for {article_name} : {model_name}" + ) + except Exception: + logger.exception("[DB INSERT 
SEQDESC OPTIM] insertion failed") + except Exception: + logger.exception( + f"[DB INSERT SEQDESC OPTIM] stitching failed for {article_name} : {model_name}" + ) + + my_sequence_descriptors = run_query_model_speed_up( + model=model, # not used in the fast version but kept for signature compatibility + article_text=article_text, + sequences=all_found_sequences, + out_base=out_base, + article_stem=article_name, + common_prompt_path=cfg.common_prompt_path, + ollama_parameters=cfg.ollama_parameters, + logger=logger, + model_name=model_name, + tqdm_position=2, + client=client, # <-- important: pass the raw ollama.Client + chat_prompts="my", + db_path=cfg.db_path, + article_name=article_name, + ) + + stamp = _now_stamp() + full_dir = out_base / "json_full" + full_dir.mkdir(parents=True, exist_ok=True) + full_seq_desc_path = ( + full_dir + / f"{article_name}_{model_name_encode(model_name)}__SeqDesc-MY__{stamp}.json" + ) + write_start_dt = datetime.now(timezone.utc) + full_seq_desc_path.write_text( + json.dumps(my_sequence_descriptors, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + write_end_dt = datetime.now(timezone.utc) + if cfg.db_path: + try: + _write_perf_sidecar_and_db( + artifact_path=full_seq_desc_path, + pass_name="SeqDesc-MY", + model_name=model_name, + article_name=article_name, + start_time=write_start_dt, + end_time=write_end_dt, + prompt_tokens=None, + completion_tokens=None, + db_path=cfg.db_path, + logger=logger, + notes="SeqDesc-MY aggregation file", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {full_seq_desc_path}: {repr(e)}" + ) + + try: + # Optional DB insert + if cfg.db_path: + try: + from hyb_db import insert_seqdesc_object # your earlier module + + run_id = insert_seqdesc_object( + db_path=str(cfg.db_path), + article_name=article_name, + doi=outputs.get("A_core", {}).get("doi", None), + model_name=model_name, + sequence_descriptors=my_sequence_descriptors, + source_path=art_path, + ) + logger.info( + 
f"[DB INSERT SEQDESC MY] inserted run_id={run_id} for {article_name} : {model_name}" + ) + except Exception: + logger.exception("[DB INSERT SEQDESC MY] insertion failed") + except Exception: + logger.exception( + f"[DB INSERT SEQDESC MY] stitching failed for {article_name} : {model_name}" + ) + + logger.warning( + "[SeqDesc-OLD] Parsing old sequence descriptors is disabled in this run." + ) + old_sequence_descriptors = [] + # old_sequence_descriptors = run_query_model( + # model=model, + # article_text=article_text, + # sequences=all_found_sequences, + # out_base=out_base, + # article_stem=article_name, + # common_prompt_path=cfg.common_prompt_path, + # ollama_parameters=cfg.ollama_parameters, + # logger=logger, + # model_name=model_name, + # tqdm_position=2, + # ) + + stamp = _now_stamp() + full_dir = out_base / "json_full" + full_dir.mkdir(parents=True, exist_ok=True) + full_seq_desc_path = ( + full_dir + / f"{article_name}_{model_name_encode(model_name)}__SeqDesc-OLD__{stamp}.json" + ) + write_start_dt = datetime.now(timezone.utc) + full_seq_desc_path.write_text( + json.dumps(old_sequence_descriptors, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + write_end_dt = datetime.now(timezone.utc) + if cfg.db_path: + try: + _write_perf_sidecar_and_db( + artifact_path=full_seq_desc_path, + pass_name="SeqDesc-OLD", + model_name=model_name, + article_name=article_name, + start_time=write_start_dt, + end_time=write_end_dt, + prompt_tokens=None, + completion_tokens=None, + db_path=cfg.db_path, + logger=logger, + notes="SeqDesc-OLD aggregation file", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {full_seq_desc_path}: {repr(e)}" + ) + + try: + # Optional DB insert + if cfg.db_path: + try: + from hyb_db import insert_seqdesc_object # your earlier module + + run_id = insert_seqdesc_object( + db_path=str(cfg.db_path), + article_name=article_name, + doi=outputs.get("A_core", {}).get("doi", None), + model_name=model_name, + 
sequence_descriptors=old_sequence_descriptors, + source_path=art_path, + ) + logger.info( + f"[DB INSERT SEQDESC OLD] inserted run_id={run_id} for {article_name} : {model_name}" + ) + except Exception: + logger.exception("[DB INSERT SEQDESC OLD] insertion failed") + except Exception: + logger.exception( + f"[DB INSERT SEQDESC OLD] stitching failed for {article_name} : {model_name}" + ) + + sequence_descriptors: List[Tuple[str, Dict[str, Any]]] = [] + sequence_descriptors.extend(optimized_sequence_descriptors) + sequence_descriptors.extend(my_sequence_descriptors) + sequence_descriptors.extend(old_sequence_descriptors) + + stamp = _now_stamp() + full_dir = out_base / "json_full" + full_dir.mkdir(parents=True, exist_ok=True) + full_seq_desc_path = ( + full_dir + / f"{article_name}_{model_name_encode(model_name)}__SeqDesc-FULL__{stamp}.json" + ) + write_start_dt = datetime.now(timezone.utc) + full_seq_desc_path.write_text( + json.dumps(sequence_descriptors, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + write_end_dt = datetime.now(timezone.utc) + if cfg.db_path: + try: + _write_perf_sidecar_and_db( + artifact_path=full_seq_desc_path, + pass_name="SeqDesc-FULL", + model_name=model_name, + article_name=article_name, + start_time=write_start_dt, + end_time=write_end_dt, + prompt_tokens=None, + completion_tokens=None, + db_path=cfg.db_path, + logger=logger, + notes="SeqDesc-FULL combined aggregation file", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {full_seq_desc_path}: {repr(e)}" + ) + + for i, seq in enumerate( + tqdm( + all_found_sequences, + desc=f"{article_name}: sequences construction", + leave=False, + position=2, + ) + ): + for construct_pass in tqdm( + cfg.construct_single_experiment_passes, + desc="Construction schemas", + leave=False, + ): + try: + run_construct_single_experiment_pass( + model=model, + article_text=article_text, + sequence=seq, + sequence_id=i, + pass_cfg=construct_pass, + out_base=out_base, 
+                        article_stem=article_name,
+                        tools=tools,
+                        logger=logger,
+                        ollama_parameters=cfg.ollama_parameters,
+                        model_name=model_name,
+                        db_path=cfg.db_path,
+                        article_name=article_name,
+                    )
+                except Exception:
+                    logger.exception(
+                        f"Pass failed: {construct_pass.name} : {article_name} : {model_name}"
+                    )
+
+            # Prepare timing for final FULL object stitching+DB
+            full_start_dt = datetime.now(timezone.utc)
+            full_path = None
+
+            # Stitch only if the expected pass names are present
+            try:
+                A = outputs.get("A_core", {})
+                B = outputs.get("B_index", {})
+                # C = outputs.get("C_sequences", {})
+                C = aggregate_c_outputs(outputs)
+                D = outputs.get("D_parameters", {})
+                E = outputs.get("E_outcomes", {})
+                F = outputs.get("F_pairings", {})
+                full_obj = stitch_full(A, B, C, D, E, F)
+
+                # Final validation
+                if full_validator:
+                    errs = sorted(
+                        full_validator.iter_errors(full_obj), key=lambda e: e.path
+                    )
+                    if errs:
+                        logger.error(
+                            f"[FULL] validation errors for {article_name} : {model_name}:\n"
+                            + "\n".join(str(e) for e in errs)
+                        )
+                    else:
+                        logger.info(
+                            f"[FULL] validation OK for {article_name} : {model_name}"
+                        )
+
+                # Save full object (timestamped)
+                stamp = _now_stamp()
+                full_dir = out_base / "json_full"
+                full_dir.mkdir(parents=True, exist_ok=True)
+                full_path = (
+                    full_dir
+                    / f"{article_name}_{model_name_encode(model_name)}__FULL__{stamp}.json"
+                )
+                full_path.write_text(
+                    json.dumps(full_obj, indent=2, ensure_ascii=False), encoding="utf-8"
+                )
+                logger.info(
+                    f"[FULL] wrote {full_path.name} {article_name} : {model_name}"
+                )
+            except Exception:
+                logger.exception(
+                    f"[FULL] stitching failed for {article_name} : {model_name}"
+                )
+
+            try:
+                # Optional DB insert
+                if cfg.db_path:
+                    try:
+                        from hyb_db import insert_article_object  # your earlier module
+
+                        run_id = insert_article_object(
+                            db_path=str(cfg.db_path),
+                            article_obj=full_obj,
+                            model_name=model_name,
+                            article_name=article_name,
+                        )
+                        logger.info(
+                            f"[DB INSERT FULL] inserted run_id={run_id} for {article_name} : 
{model_name}" + ) + except Exception: + logger.exception("[DB INSERT FULL] insertion failed") + except Exception: + logger.exception( + f"[DB INSERT FULL] stitching failed for {article_name} : {model_name}" + ) + + try: + # Optional DB insert + if cfg.db_path: + try: + from hyb_db import insert_seqdesc_object # your earlier module + + run_id = insert_seqdesc_object( + db_path=str(cfg.db_path), + article_name=article_name, + doi=outputs.get("A_core", {}).get("doi", None), + model_name=model_name, + sequence_descriptors=sequence_descriptors, + source_path=art_path, + ) + logger.info( + f"[DB INSERT SEQDESC] inserted run_id={run_id} for {article_name} : {model_name}" + ) + except Exception: + logger.exception("[DB INSERT SEQDESC] insertion failed") + except Exception: + logger.exception( + f"[DB INSERT SEQDESC] stitching failed for {article_name} : {model_name}" + ) + + # PERF sidecar + DB artifact row for FULL artifact. + # Mark article/model as "done" for continuation purposes. + if full_path is not None: + full_end_dt = datetime.now(timezone.utc) + if cfg.db_path: + try: + _write_perf_sidecar_and_db( + artifact_path=full_path, + pass_name="FULL", + model_name=model_name, + article_name=article_name, + start_time=full_start_dt, + end_time=full_end_dt, + prompt_tokens=None, + completion_tokens=None, + db_path=cfg.db_path, + logger=logger, + notes="Final stitched article object + DB inserts", + ) + except Exception as e: + logger.exception( + f"[PERF] Failed to record perf for {full_path}: {repr(e)}" + ) + + +# Optional CLI hook (project_dir arg) +if __name__ == "__main__": + # Updated CLI to support --fresh + import argparse + + parser = argparse.ArgumentParser( + description="Run the hybridization extraction pipeline." + ) + parser.add_argument( + "project_dir", + help="Path to the project directory containing config/, passes/, inputs/, etc.", + ) + parser.add_argument( + "--fresh", + action="store_true", + help="Disable continuation / resume. 
Re-run all articles even if previously completed (pass 'FULL' already recorded).", + ) + + args = parser.parse_args() + run_project(args.project_dir, fresh=args.fresh) diff --git a/extraction/requirements.txt b/extraction/requirements.txt index 441c6e5..7831a6c 100644 --- a/extraction/requirements.txt +++ b/extraction/requirements.txt @@ -54,3 +54,4 @@ triton==3.3.0 typing_extensions==4.13.2 urllib3==2.4.0 json5 +json-repair \ No newline at end of file diff --git a/extraction/schemas/article.json b/extraction/schemas/article.json new file mode 100644 index 0000000..a2c9b48 --- /dev/null +++ b/extraction/schemas/article.json @@ -0,0 +1,514 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://example.org/schemas/hybridization-article.schema.json", + "title": "Hybridization Article", + "description": "Per-article extraction of hybridization experiments as target-probe pairs (plus primers/related sequences). Includes decorated oligos (fluorophores/quenchers, 5'/3' marks, sense/antisense), and parameters stored as raw text and normalized SI.", + "type": "object", + "unevaluatedProperties": false, + + "$defs": { + "extractionReport": { + "type": "object", + "description": "Structured way to declare missing/uncertain items to avoid hallucination. Use JSON Pointers for field locations.", + "additionalProperties": false, + "required": ["missing", "uncertain", "notes"], + "properties": { + "missing": { + "type": "array", + "description": "JSON Pointers to fields that are truly unavailable in the article.", + "items": { "type": "string", "minLength": 1 }, + "minItems": 0 + }, + "uncertain": { + "type": "array", + "description": "JSON Pointers to fields that are ambiguous or weakly supported.", + "items": { "type": "string", "minLength": 1 }, + "minItems": 0 + }, + "notes": { + "type": ["string", "null"], + "description": "Free-text clarifications, e.g., OCR issues, mapping choices." 
+ } + } + }, + + "iupacBases": { + "type": "string", + "description": "DNA/RNA bases in uppercase IUPAC alphabet: A C G U/T R Y S W K M B D H V N. No separators and no ellipsis inside the sequence.", + "pattern": "^[ACGUTRYSWKMBDHVN]+$", + "minLength": 5, + "maxLength": 5000 + }, + + "provenance": { + "type": "object", + "description": "Where a value was obtained in the source document.", + "additionalProperties": false, + "required": ["source_type", "page", "section", "quote", "notes"], + "properties": { + "source_type": { + "type": "string", + "enum": ["pdf", "html", "other", "unknown"], + "description": "Type of source the extractor processed." + }, + "page": { + "type": ["integer", "null"], + "minimum": 1, + "description": "Page number in the source (1-based), if applicable." + }, + "section": { + "type": ["string", "null"], + "description": "Section header or caption in which the value appears." + }, + "quote": { + "type": ["string", "null"], + "description": "Short verbatim snippet that directly supports the value." + }, + "notes": { + "type": ["string", "null"], + "description": "Extractor notes (e.g., OCR artifact, inferred mapping)." + } + } + }, + + "measurement": { + "type": "object", + "description": "Numeric (or quasi-numeric) item holding raw text, optional parsed value/unit, and normalized SI value/unit.", + "additionalProperties": false, + "required": ["raw", "value", "unit", "si_value", "si_unit", "assumptions", "provenance"], + "properties": { + "raw": { + "type": "string", + "minLength": 1, + "maxLength": 200, + "description": "Exact text as written in the article (e.g., '58 °C', '2 mM', '10%')." + }, + "value": { + "type": ["number", "null"], + "description": "Parsed numeric value if present in raw." + }, + "unit": { + "type": ["string", "null"], + "description": "Unit as written in the article (e.g., '°C', 'mM', '%')." + }, + "si_value": { + "type": ["number", "null"], + "description": "Value converted to SI. 
Examples: temperature in K; concentrations in mol/m^3; fractions 0-1 for percent." + }, + "si_unit": { + "type": ["string", "null"], + "enum": ["K", "mol/m^3", "Pa", "kg/m^3", "s", "dimensionless"], + "description": "SI unit after conversion." + }, + "assumptions": { + "type": ["string", "null"], + "description": "Conversion assumptions (e.g., density used, ionic strength conventions)." + }, + "provenance": { "$ref": "#/$defs/provenance" } + } + }, + + "decoratedOligo": { + "type": "object", + "description": "An oligonucleotide possibly decorated at 5'/3' with labels (fluorophores/quenchers). Keeps raw string and parsed parts.", + "additionalProperties": false, + "required": ["raw", "sequence", "length_bases", "prime_prefix", "five_prime_label", "three_prime_label", "labels", "sense_antisense", "provenance"], + "properties": { + "raw": { + "type": "string", + "minLength": 5, + "maxLength": 200, + "description": "Exact oligo string as seen. MUST CONTAIN NUCLEOTIDES, NOT ONLY NAMES. DO NOT COPY THIS SEQUENCE FROM THE EXAMPLE! NEVER USE ELLIPSIS OR SKIP ANY DATA IN YOUR RESPONSE!!!" + }, + "sequence": { + "$ref": "#/$defs/iupacBases", + "description": "Bare base sequence with IUPAC letters only (no labels/hyphens)." + }, + "length_bases": { + "type": ["integer", "null"], + "minimum": 1, + "description": "Base length if given or derivable (e.g., '(27 b)')." + }, + "prime_prefix": { + "type": ["integer", "null"], + "enum": [3, 5, null], + "description": "Leading prime marker if present (3 or 5). Accepts OCR artifacts like 50/5O/5' during parsing." + }, + "five_prime_label": { + "type": ["string", "null"], + "description": "Label at the 5' end if indicated (e.g., FAM, ROX)." + }, + "three_prime_label": { + "type": ["string", "null"], + "description": "Label at the 3' end if indicated (e.g., BHQ1, BHQ2, RTQ1)." 
+        },
+        "labels": {
+          "type": "array",
+          "description": "All labels found in textual order, including 5' and 3' labels.",
+          "minItems": 0,
+          "maxItems": 10,
+          "items": { "type": "string" }
+        },
+        "sense_antisense": {
+          "type": ["string", "null"],
+          "enum": ["sense", "antisense", null],
+          "description": "If the oligo is explicitly designated as sense (s) or antisense (as) in the article."
+        },
+        "provenance": { "$ref": "#/$defs/provenance" }
+      }
+    },
+
+    "primerPair": {
+      "type": "object",
+      "description": "PCR primer pair associated with an amplicon/experiment.",
+      "additionalProperties": false,
+      "required": ["forward", "reverse"],
+      "properties": {
+        "forward": {
+          "$ref": "#/$defs/decoratedOligo",
+          "description": "Forward primer as decorated oligo."
+        },
+        "reverse": {
+          "$ref": "#/$defs/decoratedOligo",
+          "description": "Reverse primer as decorated oligo."
+        }
+      }
+    },
+
+    "probe": {
+      "type": "object",
+      "description": "A hybridization probe with name, optional amplicon ID, and decorated oligo details.",
+      "additionalProperties": false,
+      "required": ["name", "oligo", "amplicon_id", "fluorophore", "quencher", "sense_antisense", "notes"],
+      "properties": {
+        "name": {
+          "type": "string",
+          "minLength": 2,
+          "maxLength": 60,
+          "description": "Probe name exactly as used (e.g., 'N3-FAM(27)s')."
+        },
+        "amplicon_id": {
+          "type": ["string", "null"],
+          "description": "Amplicon tag associated with the probe (e.g., 'K2', 'K3', 'N2', 'N3', 'B15')."
+        },
+        "oligo": {
+          "$ref": "#/$defs/decoratedOligo",
+          "description": "The probe's decorated oligo (sequence, labels, direction)."
+        },
+        "fluorophore": {
+          "type": ["string", "null"],
+          "description": "Fluorophore name if identifiable; otherwise null."
+        },
+        "quencher": {
+          "type": ["string", "null"],
+          "description": "Quencher name if identifiable; otherwise null." 
+ }, + "sense_antisense": { + "type": ["string", "null"], + "enum": ["sense", "antisense", null], + "description": "Sense/antisense designation inferred from probe name suffix (e.g., 's' or 'as')." + }, + "notes": { + "type": ["string", "null"], + "description": "Free-text notes about the probe (ambiguities, special chemistry)." + } + } + } + }, + + "oneOf": [ + { + "title": "Article with experiments/probes", + "type": "object", + "additionalProperties": false, + "required": ["doi", "abstract", "topic", "experiments", "extraction_report"], + "properties": { + "doi": { + "type": "string", + "minLength": 4, + "maxLength": 100, + "description": "Digital Object Identifier for the article." + }, + "abstract": { + "type": "string", + "minLength": 10, + "maxLength": 2000, + "description": "Abstract or summary as extracted." + }, + "topic": { + "type": "string", + "minLength": 2, + "maxLength": 100, + "description": "Short topic/category label (e.g., 'mutation scanning by DMA')." + }, + "experiments": { + "type": "array", + "description": "Each element corresponds to a target-probe pair (plus primers/related sequences) and the full experimental context.", + "minItems": 1, + "maxItems": 2000, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["id_exp", "raw_description", "type", "description", "metadata", "sequences","experiment_properties", "outcome", "pairing", "extraction_report"], + "properties": { + "id_exp": { + "type": "string", + "minLength": 1, + "maxLength": 120, + "description": "Unique experiment identifier (derive if needed from amplicon + probe name, e.g., 'N3-FAM-27-s')." + }, + "raw_description": { + "type": ["string", "null"], + "minLength": 1, + "maxLength": 1000, + "description": "Verbatim or lightly tidied description of the experiment from the article." 
+ }, + "type": { + "type": ["string", "null"], + "minLength": 2, + "maxLength": 120, + "description": "Experiment type (e.g., 'DNA-RNA hybridization', 'real-time PCR', 'DMA')." + }, + "description": { + "type": "string", + "minLength": 10, + "maxLength": 1000, + "description": "Concise human-readable summary of this specific target-probe experiment." + }, + + "metadata": { + "type": "object", + "additionalProperties": false, + "description": "High-level descriptors linked to this experiment.", + "required": ["organism", "technology", "annealing", "pH", "rna_impurities"], + "properties": { + "organism": { + "type": ["string", "null"], + "minLength": 2, + "maxLength": 120, + "description": "Organism (e.g., 'human')." + }, + "technology": { + "type": ["string", "null"], + "minLength": 2, + "maxLength": 120, + "description": "Assay/technology label per article usage (e.g., 'real-time PCR', 'DMA')." + }, + "annealing": { + "type": ["object", "null"], + "additionalProperties": false, + "description": "Annealing process details, with optional quantitative and qualitative components.", + "required": ["quantitative", "qualitative"], + "properties": { + "quantitative": { + "$ref": "#/$defs/measurement", + "description": "Numeric representation (e.g., time or temperature), kept as raw + SI." + }, + "qualitative": { + "type": ["boolean", "null"], + "description": "If the article states a qualitative annealing outcome/criterion." + } + } + }, + "pH": { + "$ref": "#/$defs/measurement", + "description": "pH as raw text with optional parsed numeric; SI stored as dimensionless (same numeric value)." + }, + "rna_impurities": { + "type": ["object", "null"], + "additionalProperties": false, + "description": "RNA impurity information, if discussed.", + "required": ["quantitative", "qualitative"], + "properties": { + "quantitative": { + "$ref": "#/$defs/measurement", + "description": "Quantity/percentage of RNA impurities." 
+ }, + "qualitative": { + "type": ["boolean", "null"], + "description": "Presence/absence or a qualitative statement regarding RNA impurities." + } + } + } + } + }, + + "sequences": { + "type": "object", + "additionalProperties": false, + "description": "All sequences relevant to this target-probe experiment.", + "required": ["target_sequence", "probe", "primer_sequences", "related_sequences"], + "properties": { + "target_sequence": { + "oneOf": [ + { "$ref": "#/$defs/decoratedOligo" }, + { "type": "null" } + ], + "description": "Target genomic sequence if explicitly given; store as decorated oligo only if labels are present; otherwise just sequence and length." + }, + "probe": { + "$ref": "#/$defs/probe", + "description": "The hybridization probe for this experiment." + }, + "primer_sequences": { + "oneOf": [ + { "$ref": "#/$defs/primerPair" }, + { "type": "null" } + ], + "description": "PCR primers associated with this experiment/amplicon if provided." + }, + "related_sequences": { + "type": "array", + "description": "Additional related sequences (controls, references), if any.", + "minItems": 0, + "maxItems": 50, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["related_sequence", "description"], + "properties": { + "related_sequence": { + "$ref": "#/$defs/decoratedOligo", + "description": "A related sequence (plain or decorated)." + }, + "description": { + "type": ["string", "null"], + "minLength": 1, + "maxLength": 200, + "description": "Short explanation of the related sequence's role." 
+ } + } + } + } + } + }, + + "experiment_properties": { + "type": "object", + "additionalProperties": false, + "description": "Quantitative and buffer parameters for this experiment.", + "required": ["concentrations", "parameters_SI"], + "properties": { + "concentrations": { + "type": "object", + "additionalProperties": false, + "description": "Concentration-related values.", + "required": ["dna_rna_concentration", "concentration_SI"], + "properties": { + "dna_rna_concentration": { + "$ref": "#/$defs/measurement", + "description": "Analyte concentration as reported (raw) plus normalized SI (mol/m^3)." + }, + "concentration_SI": { + "$ref": "#/$defs/measurement", + "description": "Optional redundant SI-only concentration if the article already used SI; keep raw text synchronized." + } + } + }, + "parameters_SI": { + "type": "object", + "additionalProperties": false, + "description": "Assay buffer/condition parameters, represented as raw + SI. If any value is not present, fill-in measurements fields as null.", + "required": ["temperature", "Tris", "Na", "K", "Mg", "DMSO"], + "properties": { + "temperature": { + "$ref": "#/$defs/measurement", + "description": "Temperature (e.g., '58 °C'), with SI in Kelvin." + }, + "Tris": { + "$ref": "#/$defs/measurement", + "description": "Tris buffer concentration; SI in mol/m^3 (1 mM = 1 mol/m^3)." + }, + "Na": { + "$ref": "#/$defs/measurement", + "description": "Sodium ion concentration; SI in mol/m^3." + }, + "K": { + "$ref": "#/$defs/measurement", + "description": "Potassium ion concentration; SI in mol/m^3." + }, + "Mg": { + "$ref": "#/$defs/measurement", + "description": "Magnesium ion concentration; SI in mol/m^3." + }, + "DMSO": { + "$ref": "#/$defs/measurement", + "description": "DMSO amount (often % v/v); SI as dimensionless fraction (percent/100)." 
+ } + } + } + } + }, + + "outcome": { + "type": "object", + "additionalProperties": false, + "description": "Results for this target-probe pairing.", + "required": ["outcome", "fluorescence", "comparative_notes"], + "properties": { + "outcome": { + "type": ["boolean", "null"], + "description": "Boolean result if explicitly stated (e.g., success/failure). If not explicit, leave null." + }, + "fluorescence": { + "$ref": "#/$defs/measurement", + "description": "Fluorescence or signal measurement (raw text + normalized form if numeric). If comparative only, keep statement in 'raw' and numeric fields null." + }, + "comparative_notes": { + "type": ["string", "null"], + "minLength": 0, + "maxLength": 500, + "description": "Comparative statements (e.g., 'N3-FAM stronger in real-time PCR; N3-Cy5 stronger in DMA')." + } + } + }, + + "pairing": { + "type": "object", + "additionalProperties": false, + "description": "Optional cross-references to paired/reciprocal probes within the same article.", + "required": ["paired_with_probe_name", "relationship"], + "properties": { + "paired_with_probe_name": { + "type": ["string", "null"], + "description": "Name of the other probe in a reciprocal comparison (e.g., 'N3-Cy5(27)s')." + }, + "relationship": { + "type": ["string", "null"], + "description": "Short label describing the relation (e.g., 'reciprocal comparison', 'same sequence different labels')." + } + } + }, + + "extraction_report": { "$ref": "#/$defs/extractionReport" } + } + } + }, + + "extraction_report": { "$ref": "#/$defs/extractionReport" } + } + }, + + { + "title": "Article with no hybridization probe sequences", + "type": "object", + "additionalProperties": false, + "required": ["doi", "explanation_why_does_not_this_article_have_any_hybridization_probes_sequences", "extraction_report"], + "properties": { + "doi": { + "type": "string", + "minLength": 4, + "maxLength": 100, + "description": "Digital Object Identifier for the article." 
+ }, + "explanation_why_does_not_this_article_have_any_hybridization_probes_sequences": { + "type": "string", + "minLength": 50, + "maxLength": 2000, + "description": "A detailed justification straight from the article explaining the absence of probe sequences." + }, + "extraction_report": { "$ref": "#/$defs/extractionReport" } + } + } + ] +} diff --git a/pipeline.py b/pipeline.py index 9dd9726..a98b7ec 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + # Pipeline: PROBESt---- # 1. 
Initial set generation diff --git a/scripts/articles/fetch_article_text.py b/scripts/articles/fetch_article_text.py index ac9090b..a3e4b19 100644 --- a/scripts/articles/fetch_article_text.py +++ b/scripts/articles/fetch_article_text.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + import os import sys import requests diff --git a/scripts/benchmarking/test_data_gen.py b/scripts/benchmarking/test_data_gen.py index 20e2d1a..76207c3 100644 --- a/scripts/benchmarking/test_data_gen.py +++ b/scripts/benchmarking/test_data_gen.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + import argparse import json import pandas as pd diff --git a/scripts/databases/generate_noisy_probes.py b/scripts/databases/generate_noisy_probes.py index 715beba..6c90d85 100644 --- a/scripts/databases/generate_noisy_probes.py +++ b/scripts/databases/generate_noisy_probes.py @@ -1,4 +1,30 @@ #!/usr/bin/env python3 +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + import pandas as pd import numpy as np diff --git a/scripts/databases/probeBase.py b/scripts/databases/probeBase.py index 25a2daf..89b6871 100644 --- a/scripts/databases/probeBase.py +++ b/scripts/databases/probeBase.py @@ -1,60 +1,86 @@ -import requests -from bs4 import BeautifulSoup -import pandas as pd - - -def parse_probebase_page(url: str) -> pd.Series: - """Parse one page from ProbeBase database to the uniform format - - Parameters - ---------- - url : str - URL string, path to the probebase page - - Returns - ------- - table : pd.DataFrame - parsed table from probebase - """ - - # Download html and get table - - response = requests.get(url) - - # Response checking - if response.status_code == 200: - # Parse the HTML content using BeautifulSoup - soup = BeautifulSoup(response.content, 'html.parser') - - # if table is empty - if soup.find_all('tr') == []: # CORRECT!!!!!!!!!!!!!!!!!!! - return Warning('Page without probe data.frame or parsing problems') - - # Create a list to hold the rows of the table - table_data = [] - - # Loop through each row in the table - for row in soup.find_all('tr'): - # print(row, "###############") - cells = row.find_all(['td', 'th', 'value']) - row_data = [cell.get_text(strip=True) for cell in cells] - table_data.append(row_data) - - # Convert the list of rows into a DataFrame for easier manipulation - df = pd.DataFrame(table_data) - - # Check if we have enough columns - if df.shape[1] < 2: - return Warning('Table has insufficient columns') - - df.iloc[0,0] = 'Test' - df.iloc[1,0] = 'Name' - - df2 = df.iloc[:,1] - df2.index = df.iloc[:,0] - - # Display the result - return df2 - - else: +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +import requests +from bs4 import BeautifulSoup +import pandas as pd + + +def parse_probebase_page(url: str) -> pd.Series: + """Parse one page from ProbeBase database to the uniform format + + Parameters + ---------- + url : str + URL string, path to the probebase page + + Returns + ------- + table : pd.DataFrame + parsed table from probebase + """ + + # Download html and get table + + response = requests.get(url) + + # Response checking + if response.status_code == 200: + # Parse the HTML content using BeautifulSoup + soup = BeautifulSoup(response.content, 'html.parser') + + # if table is empty + if soup.find_all('tr') == []: # CORRECT!!!!!!!!!!!!!!!!!!! 
+ return Warning('Page without probe data.frame or parsing problems') + + # Create a list to hold the rows of the table + table_data = [] + + # Loop through each row in the table + for row in soup.find_all('tr'): + # print(row, "###############") + cells = row.find_all(['td', 'th', 'value']) + row_data = [cell.get_text(strip=True) for cell in cells] + table_data.append(row_data) + + # Convert the list of rows into a DataFrame for easier manipulation + df = pd.DataFrame(table_data) + + # Check if we have enough columns + if df.shape[1] < 2: + return Warning('Table has insufficient columns') + + df.iloc[0,0] = 'Test' + df.iloc[1,0] = 'Name' + + df2 = df.iloc[:,1] + df2.index = df.iloc[:,0] + + # Display the result + return df2 + + else: return ImportWarning(f"Failed to retrieve the page. Status code: {response.status_code}") \ No newline at end of file diff --git a/scripts/databases/probeBase_parse.py b/scripts/databases/probeBase_parse.py index 6667c99..fa91f97 100644 --- a/scripts/databases/probeBase_parse.py +++ b/scripts/databases/probeBase_parse.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pandas as pd from tqdm import tqdm import re diff --git a/scripts/databases/probeBase_wide.py b/scripts/databases/probeBase_wide.py index 196eedd..9c5f6f4 100644 --- a/scripts/databases/probeBase_wide.py +++ b/scripts/databases/probeBase_wide.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pandas as pd # Read the CSV file diff --git a/scripts/generator/ML_filtration.py b/scripts/generator/ML_filtration.py index 1e978ab..d0a9e52 100644 --- a/scripts/generator/ML_filtration.py +++ b/scripts/generator/ML_filtration.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + import os import pandas as pd import numpy as np diff --git a/scripts/generator/probe_filt.py b/scripts/generator/probe_filt.py index c2963b7..9fcecd6 100644 --- a/scripts/generator/probe_filt.py +++ b/scripts/generator/probe_filt.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + # imports import sys import numpy as np diff --git a/scripts/grid_search/test_parameters.py b/scripts/grid_search/test_parameters.py index 7259b83..421b64f 100644 --- a/scripts/grid_search/test_parameters.py +++ b/scripts/grid_search/test_parameters.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + ''' Script for running the main pipeline with a given parameters grid to collect hit statistics. 
''' diff --git a/scripts/loop_generation/download_genomes.py b/scripts/loop_generation/download_genomes.py index e79f104..10795d2 100644 --- a/scripts/loop_generation/download_genomes.py +++ b/scripts/loop_generation/download_genomes.py @@ -1,4 +1,30 @@ #!/usr/bin/env python3 +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """ Download genomes for bacterial species from NCBI. Downloads up to 100 genomes per species. 
diff --git a/scripts/loop_generation/merge_outputs.py b/scripts/loop_generation/merge_outputs.py index 79fcc97..fbdf8dc 100644 --- a/scripts/loop_generation/merge_outputs.py +++ b/scripts/loop_generation/merge_outputs.py @@ -1,4 +1,30 @@ #!/usr/bin/env python3 +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """ Merge modeling outputs from all species into a single file with species_name column. 
""" diff --git a/scripts/validation/pdf_to_seq.py b/scripts/validation/pdf_to_seq.py index f30ce12..28408cb 100644 --- a/scripts/validation/pdf_to_seq.py +++ b/scripts/validation/pdf_to_seq.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + from pdfminer.high_level import extract_text import re import argparse diff --git a/scripts/validation/validation.py b/scripts/validation/validation.py index db93242..b7af24d 100644 --- a/scripts/validation/validation.py +++ b/scripts/validation/validation.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + from jsonschema import validate, ValidationError import json import argparse diff --git a/setup.py b/setup.py index ff0b081..bcd5bc3 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + from setuptools import setup, find_packages import os diff --git a/src/PROBESt/AI.py b/src/PROBESt/AI.py index 5cf8278..6fe99fb 100644 --- a/src/PROBESt/AI.py +++ b/src/PROBESt/AI.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import numpy as np import pandas as pd from sklearn.linear_model import LogisticRegression diff --git a/src/PROBESt/__init__.py b/src/PROBESt/__init__.py index 3b952fd..b283e9c 100644 --- a/src/PROBESt/__init__.py +++ b/src/PROBESt/__init__.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """PROBESt package.""" from . import genome_operations diff --git a/src/PROBESt/args.py b/src/PROBESt/args.py index e696f9b..a1deb6b 100644 --- a/src/PROBESt/args.py +++ b/src/PROBESt/args.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import argparse def arguments_parse(): diff --git a/src/PROBESt/bash_wrappers.py b/src/PROBESt/bash_wrappers.py index 1b46d0d..f03e2e2 100644 --- a/src/PROBESt/bash_wrappers.py +++ b/src/PROBESt/bash_wrappers.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import subprocess def uniline_fasta(args, out): diff --git a/src/PROBESt/check_probe_pdf.py b/src/PROBESt/check_probe_pdf.py index 5c6813e..8a8098c 100644 --- a/src/PROBESt/check_probe_pdf.py +++ b/src/PROBESt/check_probe_pdf.py @@ -1,4 +1,30 @@ #!/usr/bin/env python3 +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """ PDF Nucleotide Sequence Checker diff --git a/src/PROBESt/dedegeneration.py b/src/PROBESt/dedegeneration.py index a0c001b..11cd3d3 100644 --- a/src/PROBESt/dedegeneration.py +++ b/src/PROBESt/dedegeneration.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """ De-degeneration module for PROBESt. diff --git a/src/PROBESt/evolution.py b/src/PROBESt/evolution.py index 9d9b8ce..1c1ccf8 100644 --- a/src/PROBESt/evolution.py +++ b/src/PROBESt/evolution.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import random diff --git a/src/PROBESt/filtration.py b/src/PROBESt/filtration.py index 6a749fc..30f5652 100644 --- a/src/PROBESt/filtration.py +++ b/src/PROBESt/filtration.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import os import pandas as pd import numpy as np diff --git a/src/PROBESt/genome_operations.py b/src/PROBESt/genome_operations.py index 1e2c8d6..2b3999f 100644 --- a/src/PROBESt/genome_operations.py +++ b/src/PROBESt/genome_operations.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Module for genome operations including fetching, BLAST search and parsing.""" from Bio import Entrez, SeqIO diff --git a/src/PROBESt/merge.py b/src/PROBESt/merge.py index 600fcc4..16f84bb 100644 --- a/src/PROBESt/merge.py +++ b/src/PROBESt/merge.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import subprocess from shutil import copyfile diff --git a/src/PROBESt/misc.py b/src/PROBESt/misc.py index 0966726..0ae014d 100644 --- a/src/PROBESt/misc.py +++ b/src/PROBESt/misc.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import os import re import subprocess diff --git a/src/PROBESt/modeling.py b/src/PROBESt/modeling.py index 0c22b13..7b8400f 100644 --- a/src/PROBESt/modeling.py +++ b/src/PROBESt/modeling.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """ Modeling module for PROBESt pipeline. diff --git a/src/PROBESt/models_registry.py b/src/PROBESt/models_registry.py index 4c74497..68dd262 100644 --- a/src/PROBESt/models_registry.py +++ b/src/PROBESt/models_registry.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import torch import torch.nn as nn from torch.nn import functional as F diff --git a/src/PROBESt/oligominer.py b/src/PROBESt/oligominer.py index b1ddc3b..f492c3f 100644 --- a/src/PROBESt/oligominer.py +++ b/src/PROBESt/oligominer.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + from Bio import SeqIO import os import subprocess diff --git a/src/PROBESt/prepare_blast.py b/src/PROBESt/prepare_blast.py index d0e2a56..281d4f2 100644 --- a/src/PROBESt/prepare_blast.py +++ b/src/PROBESt/prepare_blast.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """ Module for preparing BLAST databases from FASTA directories. diff --git a/src/PROBESt/primer3.py b/src/PROBESt/primer3.py index 263a314..bde344b 100644 --- a/src/PROBESt/primer3.py +++ b/src/PROBESt/primer3.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + from Bio import SeqIO import os import subprocess diff --git a/src/PROBESt/probe_alignment_profiler.py b/src/PROBESt/probe_alignment_profiler.py index d226ace..5495465 100644 --- a/src/PROBESt/probe_alignment_profiler.py +++ b/src/PROBESt/probe_alignment_profiler.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Module for analyzing BLAST alignments of probe-target sequence pairs. This module provides functionality to process BLAST alignment results and calculate diff --git a/src/PROBESt/rna_structure.py b/src/PROBESt/rna_structure.py index 53ebbc5..004cd19 100644 --- a/src/PROBESt/rna_structure.py +++ b/src/PROBESt/rna_structure.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import RNA from typing import Union, Tuple, Optional diff --git a/src/PROBESt/tokenization.py b/src/PROBESt/tokenization.py index 242c06f..97b1ae2 100644 --- a/src/PROBESt/tokenization.py +++ b/src/PROBESt/tokenization.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Tokenization module for DNA sequences. This module provides functions to tokenize DNA sequences into k-mers, diff --git a/src/conf.py b/src/conf.py index f3b490b..31db77f 100644 --- a/src/conf.py +++ b/src/conf.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + # Configuration file for the Sphinx documentation builder. # # For the full list of built-in configuration values, see the documentation: diff --git a/tests/PROBESt/test_AI.py b/tests/PROBESt/test_AI.py index fa9246d..9b76932 100644 --- a/tests/PROBESt/test_AI.py +++ b/tests/PROBESt/test_AI.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pytest import pandas as pd import numpy as np diff --git a/tests/PROBESt/test_dedegeneration.py b/tests/PROBESt/test_dedegeneration.py index d64f5a2..76abfd4 100644 --- a/tests/PROBESt/test_dedegeneration.py +++ b/tests/PROBESt/test_dedegeneration.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Tests for the de-degeneration module.""" import pytest diff --git a/tests/PROBESt/test_filtration.py b/tests/PROBESt/test_filtration.py index c5c8207..bbd2f1d 100644 --- a/tests/PROBESt/test_filtration.py +++ b/tests/PROBESt/test_filtration.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pytest import pandas as pd import numpy as np diff --git a/tests/PROBESt/test_genome_operations.py b/tests/PROBESt/test_genome_operations.py index fbeaa92..28f6c7d 100644 --- a/tests/PROBESt/test_genome_operations.py +++ b/tests/PROBESt/test_genome_operations.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Tests for genome operations module.""" import pytest diff --git a/tests/PROBESt/test_merge.py b/tests/PROBESt/test_merge.py index 40b5329..462b308 100644 --- a/tests/PROBESt/test_merge.py +++ b/tests/PROBESt/test_merge.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pytest import os import subprocess diff --git a/tests/PROBESt/test_misc.py b/tests/PROBESt/test_misc.py index f256a89..58e9f02 100644 --- a/tests/PROBESt/test_misc.py +++ b/tests/PROBESt/test_misc.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pytest import pandas as pd import numpy as np diff --git a/tests/PROBESt/test_modeling.py b/tests/PROBESt/test_modeling.py index 1f36696..073919a 100644 --- a/tests/PROBESt/test_modeling.py +++ b/tests/PROBESt/test_modeling.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Tests for modeling module.""" import pytest diff --git a/tests/PROBESt/test_oligominer.py b/tests/PROBESt/test_oligominer.py index 62056b5..1483e18 100644 --- a/tests/PROBESt/test_oligominer.py +++ b/tests/PROBESt/test_oligominer.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pytest import os import sys diff --git a/tests/PROBESt/test_prepare_blast.py b/tests/PROBESt/test_prepare_blast.py index 8f62e43..be9be01 100644 --- a/tests/PROBESt/test_prepare_blast.py +++ b/tests/PROBESt/test_prepare_blast.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Tests for prepare_blast module.""" import pytest diff --git a/tests/PROBESt/test_primer3.py b/tests/PROBESt/test_primer3.py index 4ddca8a..ada2de6 100644 --- a/tests/PROBESt/test_primer3.py +++ b/tests/PROBESt/test_primer3.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pytest import os import sys diff --git a/tests/PROBESt/test_probe_alignment_profiler.py b/tests/PROBESt/test_probe_alignment_profiler.py index c322df9..8616ccd 100644 --- a/tests/PROBESt/test_probe_alignment_profiler.py +++ b/tests/PROBESt/test_probe_alignment_profiler.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Tests for probe alignment profiler.""" import pytest diff --git a/tests/PROBESt/test_rna_structure.py b/tests/PROBESt/test_rna_structure.py index 9abbb47..327ff6f 100644 --- a/tests/PROBESt/test_rna_structure.py +++ b/tests/PROBESt/test_rna_structure.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import pytest from src.PROBESt.rna_structure import calculate_hairpin_prob, calculate_dimer_G, get_reverse_complement diff --git a/tests/__init__.py b/tests/__init__.py index df825a5..06bd778 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1 +1,27 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + """Test package for PROBESt.""" \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index ecd143a..122c259 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + """Common pytest configurations and fixtures.""" import pytest diff --git a/tests/database/test_probebase.py b/tests/database/test_probebase.py index 193f189..ddc513d 100644 --- a/tests/database/test_probebase.py +++ b/tests/database/test_probebase.py @@ -1,64 +1,90 @@ -"""Tests for ProbeBase database parsing.""" - -import pytest -import pandas as pd -from unittest.mock import patch, MagicMock -from scripts.databases.probeBase import parse_probebase_page - -@pytest.fixture -def mock_response(): - """Create a mock response object.""" - mock = MagicMock() - mock.status_code = 200 - return mock - -@patch('requests.get') -def test_response_problem(mock_get, mock_response): - """Test page with response problem.""" - # Setup mock response - mock_response.content = b"Error page" - mock_get.return_value = mock_response - - data = "https://probebase.csb.univie.ac.at/pb_report/probe" - resp = parse_probebase_page(data) - assert isinstance(resp, Warning) - -@patch('requests.get') -def test_response_empty(mock_get, mock_response): - """Test page with empty table.""" - # Setup mock response with empty table - mock_response.content = b""" - - - - - -
Header
No data
- - - """ - mock_get.return_value = mock_response - - data = "https://probebase.csb.univie.ac.at/pb_report/probe/1" - resp = parse_probebase_page(data) - assert isinstance(resp, Warning) - -@patch('requests.get') -def test_response_content(mock_get, mock_response): - """Test page with content.""" - # Setup mock response with valid table data - mock_response.content = b""" - - - - - -
Probe IDSequence
1ATGC
- - - """ - mock_get.return_value = mock_response - - data = "https://probebase.csb.univie.ac.at/pb_report/probe/2" - resp = parse_probebase_page(data) +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + +"""Tests for ProbeBase database parsing.""" + +import pytest +import pandas as pd +from unittest.mock import patch, MagicMock +from scripts.databases.probeBase import parse_probebase_page + +@pytest.fixture +def mock_response(): + """Create a mock response object.""" + mock = MagicMock() + mock.status_code = 200 + return mock + +@patch('requests.get') +def test_response_problem(mock_get, mock_response): + """Test page with response problem.""" + # Setup mock response + mock_response.content = b"Error page" + mock_get.return_value = mock_response + + data = "https://probebase.csb.univie.ac.at/pb_report/probe" + resp = parse_probebase_page(data) + assert isinstance(resp, Warning) + +@patch('requests.get') +def test_response_empty(mock_get, mock_response): + """Test page with empty table.""" + # Setup mock response with empty table + mock_response.content = b""" + + + + + +
Header
No data
+ + + """ + mock_get.return_value = mock_response + + data = "https://probebase.csb.univie.ac.at/pb_report/probe/1" + resp = parse_probebase_page(data) + assert isinstance(resp, Warning) + +@patch('requests.get') +def test_response_content(mock_get, mock_response): + """Test page with content.""" + # Setup mock response with valid table data + mock_response.content = b""" + + + + + +
Probe IDSequence
1ATGC
+ + + """ + mock_get.return_value = mock_response + + data = "https://probebase.csb.univie.ac.at/pb_report/probe/2" + resp = parse_probebase_page(data) assert isinstance(resp, pd.Series) \ No newline at end of file diff --git a/tests/scripts/fasta2table.py b/tests/scripts/fasta2table.py index 241f5b2..aa30fe7 100644 --- a/tests/scripts/fasta2table.py +++ b/tests/scripts/fasta2table.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + import unittest import os import subprocess diff --git a/tests/scripts/prep_db.py b/tests/scripts/prep_db.py index 728c00f..90b70a1 100644 --- a/tests/scripts/prep_db.py +++ b/tests/scripts/prep_db.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import unittest import os import subprocess diff --git a/tests/scripts/probe_check.py b/tests/scripts/probe_check.py index f78621a..085d0fe 100644 --- a/tests/scripts/probe_check.py +++ b/tests/scripts/probe_check.py @@ -1,3 +1,29 @@ +# MIT License +# +# Copyright (c) 2025 CTLab-ITMO +# +# Authors: Daniil Smutin, Aleksandr Serdiukov, Vitalii Dravgelis, Artem Ivanov, +# Aleksei Zabashta, Sergey Muravyov, and the CTLab-ITMO university team. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + import unittest import os import subprocess