From 79e5f320bca607031f7d04944b8146da60ca4402 Mon Sep 17 00:00:00 2001 From: nextstrain-bot <78992647+nextstrain-bot@users.noreply.github.com> Date: Tue, 14 Apr 2026 22:15:26 +0000 Subject: [PATCH 1/2] git subrepo pull (merge) --force ingest/vendored subrepo: subdir: "ingest/vendored" merged: "c29898f" upstream: origin: "https://github.com/nextstrain/ingest" branch: "main" commit: "c29898f" git-subrepo: version: "0.4.9" origin: "https://github.com/ingydotnet/git-subrepo" commit: "5e0f401" --- ingest/vendored/.github/dependabot.yml | 2 +- ingest/vendored/.github/workflows/ci.yaml | 2 +- .../.github/workflows/pre-commit.yaml | 4 +- ingest/vendored/.gitrepo | 6 +- ingest/vendored/README.md | 70 ++++---- ingest/vendored/notify-slack | 56 ------ ingest/vendored/scripts/assign-colors | 96 +++++++++++ .../{ => scripts}/cloudfront-invalidate | 0 .../vendored/{ => scripts}/download-from-s3 | 0 .../{ => scripts}/fetch-from-ncbi-entrez | 0 ingest/vendored/{ => scripts}/notify-on-diff | 0 .../vendored/{ => scripts}/notify-on-job-fail | 0 .../{ => scripts}/notify-on-job-start | 0 .../{ => scripts}/notify-on-record-change | 0 ingest/vendored/scripts/notify-slack | 93 ++++++++++ .../vendored/{ => scripts}/s3-object-exists | 0 ingest/vendored/{ => scripts}/sha256sum | 0 ingest/vendored/{ => scripts}/trigger | 0 .../{ => scripts}/trigger-on-new-data | 0 ingest/vendored/{ => scripts}/upload-to-s3 | 0 ingest/vendored/snakemake/config.smk | 156 +++++++++++++++++ ingest/vendored/snakemake/remote_files.smk | 159 ++++++++++++++++++ 22 files changed, 551 insertions(+), 93 deletions(-) delete mode 100755 ingest/vendored/notify-slack create mode 100755 ingest/vendored/scripts/assign-colors rename ingest/vendored/{ => scripts}/cloudfront-invalidate (100%) rename ingest/vendored/{ => scripts}/download-from-s3 (100%) rename ingest/vendored/{ => scripts}/fetch-from-ncbi-entrez (100%) rename ingest/vendored/{ => scripts}/notify-on-diff (100%) rename ingest/vendored/{ => 
scripts}/notify-on-job-fail (100%) rename ingest/vendored/{ => scripts}/notify-on-job-start (100%) rename ingest/vendored/{ => scripts}/notify-on-record-change (100%) create mode 100755 ingest/vendored/scripts/notify-slack rename ingest/vendored/{ => scripts}/s3-object-exists (100%) rename ingest/vendored/{ => scripts}/sha256sum (100%) rename ingest/vendored/{ => scripts}/trigger (100%) rename ingest/vendored/{ => scripts}/trigger-on-new-data (100%) rename ingest/vendored/{ => scripts}/upload-to-s3 (100%) create mode 100644 ingest/vendored/snakemake/config.smk create mode 100644 ingest/vendored/snakemake/remote_files.smk diff --git a/ingest/vendored/.github/dependabot.yml b/ingest/vendored/.github/dependabot.yml index 89bd084..0a50ee1 100644 --- a/ingest/vendored/.github/dependabot.yml +++ b/ingest/vendored/.github/dependabot.yml @@ -4,7 +4,7 @@ # Each ecosystem is checked on a scheduled interval defined below. To trigger # a check manually, go to # -# https://github.com/nextstrain/ingest/network/updates +# https://github.com/nextstrain/shared/network/updates # # and look for a "Check for updates" button. You may need to click around a # bit first. 
diff --git a/ingest/vendored/.github/workflows/ci.yaml b/ingest/vendored/.github/workflows/ci.yaml index c716277..6dba82b 100644 --- a/ingest/vendored/.github/workflows/ci.yaml +++ b/ingest/vendored/.github/workflows/ci.yaml @@ -11,5 +11,5 @@ jobs: shellcheck: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - uses: nextstrain/.github/actions/shellcheck@master diff --git a/ingest/vendored/.github/workflows/pre-commit.yaml b/ingest/vendored/.github/workflows/pre-commit.yaml index 70da533..c63b890 100644 --- a/ingest/vendored/.github/workflows/pre-commit.yaml +++ b/ingest/vendored/.github/workflows/pre-commit.yaml @@ -7,8 +7,8 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - uses: actions/checkout@v6 + - uses: actions/setup-python@v6 with: python-version: "3.12" - uses: pre-commit/action@v3.0.1 diff --git a/ingest/vendored/.gitrepo b/ingest/vendored/.gitrepo index 544d60f..53c414e 100644 --- a/ingest/vendored/.gitrepo +++ b/ingest/vendored/.gitrepo @@ -6,7 +6,7 @@ [subrepo] remote = https://github.com/nextstrain/ingest branch = main - commit = 258ab8ce898a88089bc88caee336f8d683a0e79a - parent = a7af2c05fc4ccc822c8ef38f0001dc5e8bee803b + commit = c29898f7c32c3f85d65db235d23a78e776f89120 + parent = 2d06e5de2f761a090d45e0bfcb8b1d510fffdc83 method = merge - cmdver = 0.4.7 + cmdver = 0.4.9 diff --git a/ingest/vendored/README.md b/ingest/vendored/README.md index a2b54cb..d4b1874 100644 --- a/ingest/vendored/README.md +++ b/ingest/vendored/README.md @@ -1,6 +1,6 @@ -# ingest +# shared -Shared internal tooling for pathogen data ingest. Used by our individual +Shared internal tooling for pathogen workflows. Used by our individual pathogen repos which produce Nextstrain builds. Expected to be vendored by each pathogen repo using `git subrepo`. 
@@ -9,47 +9,47 @@ Some tools may only live here temporarily before finding a permanent home in ## Vendoring -Nextstrain maintained pathogen repos will use [`git subrepo`](https://github.com/ingydotnet/git-subrepo) to vendor ingest scripts. -(See discussion on this decision in https://github.com/nextstrain/ingest/issues/3) +Nextstrain maintained pathogen repos will use [`git subrepo`](https://github.com/ingydotnet/git-subrepo) to vendor shared scripts. +(See discussion on this decision in https://github.com/nextstrain/shared/issues/3) For a list of Nextstrain repos that are currently using this method, use [this GitHub code search](https://github.com/search?type=code&q=org%3Anextstrain+subrepo+%22remote+%3D+https%3A%2F%2Fgithub.com%2Fnextstrain%2Fingest%22). If you don't already have `git subrepo` installed, follow the [git subrepo installation instructions](https://github.com/ingydotnet/git-subrepo#installation). -Then add the latest ingest scripts to the pathogen repo by running: +Then add the latest shared scripts to the pathogen repo by running: ``` -git subrepo clone https://github.com/nextstrain/ingest ingest/vendored +git subrepo clone https://github.com/nextstrain/shared shared/vendored ``` -Any future updates of ingest scripts can be pulled in with: +Any future updates of shared scripts can be pulled in with: ``` -git subrepo pull ingest/vendored +git subrepo pull shared/vendored ``` If you run into merge conflicts and would like to pull in a fresh copy of the -latest ingest scripts, pull with the `--force` flag: +latest shared scripts, pull with the `--force` flag: ``` -git subrepo pull ingest/vendored --force +git subrepo pull shared/vendored --force ``` > **Warning** > Beware of rebasing/dropping the parent commit of a `git subrepo` update -`git subrepo` relies on metadata in the `ingest/vendored/.gitrepo` file, +`git subrepo` relies on metadata in the `shared/vendored/.gitrepo` file, which includes the hash for the parent commit in the pathogen repos. 
If this hash no longer exists in the commit history, there will be errors when running future `git subrepo pull` commands. If you run into an error similar to the following: ``` -$ git subrepo pull ingest/vendored -git-subrepo: Command failed: 'git branch subrepo/ingest/vendored '. +$ git subrepo pull shared/vendored +git-subrepo: Command failed: 'git branch subrepo/shared/vendored '. fatal: not a valid object name: '' ``` -Check the parent commit hash in the `ingest/vendored/.gitrepo` file and make +Check the parent commit hash in the `shared/vendored/.gitrepo` file and make sure the commit exists in the commit history. Update to the appropriate parent commit hash if needed. @@ -84,39 +84,49 @@ approach to "ingest" has been discussed in various internal places, including: ## Scripts -Scripts for supporting ingest workflow automation that don’t really belong in any of our existing tools. +Scripts for supporting workflow automation that don’t really belong in any of our existing tools. -- [notify-on-diff](notify-on-diff) - Send Slack message with diff of a local file and an S3 object -- [notify-on-job-fail](notify-on-job-fail) - Send Slack message with details about failed workflow job on GitHub Actions and/or AWS Batch -- [notify-on-job-start](notify-on-job-start) - Send Slack message with details about workflow job on GitHub Actions and/or AWS Batch -- [notify-on-record-change](notify-on-recod-change) - Send Slack message with details about line count changes for a file compared to an S3 object's metadata `recordcount`. +- [assign-colors](scripts/assign-colors) - Generate colors.tsv for augur export based on ordering, color schemes, and what exists in the metadata. Used in the phylogenetic or nextclade workflows. 
+- [notify-on-diff](scripts/notify-on-diff) - Send Slack message with diff of a local file and an S3 object +- [notify-on-job-fail](scripts/notify-on-job-fail) - Send Slack message with details about failed workflow job on GitHub Actions and/or AWS Batch +- [notify-on-job-start](scripts/notify-on-job-start) - Send Slack message with details about workflow job on GitHub Actions and/or AWS Batch +- [notify-on-record-change](scripts/notify-on-record-change) - Send Slack message with details about line count changes for a file compared to an S3 object's metadata `recordcount`. If the S3 object's metadata does not have `recordcount`, then will attempt to download S3 object to count lines locally, which only supports `xz` compressed S3 objects. -- [notify-slack](notify-slack) - Send message or file to Slack -- [s3-object-exists](s3-object-exists) - Used to prevent 404 errors during S3 file comparisons in the notify-* scripts -- [trigger](trigger) - Triggers downstream GitHub Actions via the GitHub API using repository_dispatch events. -- [trigger-on-new-data](trigger-on-new-data) - Triggers downstream GitHub Actions if the provided `upload-to-s3` outputs do not contain the `identical_file_message` +- [notify-slack](scripts/notify-slack) - Send message or file to Slack +- [s3-object-exists](scripts/s3-object-exists) - Used to prevent 404 errors during S3 file comparisons in the notify-* scripts +- [trigger](scripts/trigger) - Triggers downstream GitHub Actions via the GitHub API using repository_dispatch events. +- [trigger-on-new-data](scripts/trigger-on-new-data) - Triggers downstream GitHub Actions if the provided `upload-to-s3` outputs do not contain the `identical_file_message` A hacky way to ensure that we only trigger downstream phylogenetic builds if the S3 objects have been updated. + NCBI interaction scripts that are useful for fetching public metadata and sequences. 
-- [fetch-from-ncbi-entrez](fetch-from-ncbi-entrez) - Fetch metadata and nucleotide sequences from [NCBI Entrez](https://www.ncbi.nlm.nih.gov/books/NBK25501/) and output to a GenBank file. +- [fetch-from-ncbi-entrez](scripts/fetch-from-ncbi-entrez) - Fetch metadata and nucleotide sequences from [NCBI Entrez](https://www.ncbi.nlm.nih.gov/books/NBK25501/) and output to a GenBank file. Useful for pathogens with metadata and annotations in custom fields that are not part of the standard [NCBI Datasets](https://www.ncbi.nlm.nih.gov/datasets/) outputs. -Historically, some pathogen repos used the undocumented NCBI Virus API through [fetch-from-ncbi-virus](https://github.com/nextstrain/ingest/blob/c97df238518171c2b1574bec0349a55855d1e7a7/fetch-from-ncbi-virus) to fetch data. However we've opted to drop the NCBI Virus scripts due to https://github.com/nextstrain/ingest/issues/18. +Historically, some pathogen repos used the undocumented NCBI Virus API through [fetch-from-ncbi-virus](https://github.com/nextstrain/shared/blob/c97df238518171c2b1574bec0349a55855d1e7a7/fetch-from-ncbi-virus) to fetch data. However we've opted to drop the NCBI Virus scripts due to https://github.com/nextstrain/shared/issues/18. Potential Nextstrain CLI scripts -- [sha256sum](sha256sum) - Used to check if files are identical in upload-to-s3 and download-from-s3 scripts. -- [cloudfront-invalidate](cloudfront-invalidate) - CloudFront invalidation is already supported in the [nextstrain remote command for S3 files](https://github.com/nextstrain/cli/blob/a5dda9c0579ece7acbd8e2c32a4bbe95df7c0bce/nextstrain/cli/remote/s3.py#L104). +- [sha256sum](scripts/sha256sum) - Used to check if files are identical in upload-to-s3 and download-from-s3 scripts. 
+- [cloudfront-invalidate](scripts/cloudfront-invalidate) - CloudFront invalidation is already supported in the [nextstrain remote command for S3 files](https://github.com/nextstrain/cli/blob/a5dda9c0579ece7acbd8e2c32a4bbe95df7c0bce/nextstrain/cli/remote/s3.py#L104). This exists as a separate script to support CloudFront invalidation when using the upload-to-s3 script. -- [upload-to-s3](upload-to-s3) - Upload file to AWS S3 bucket with compression based on file extension in S3 URL. +- [upload-to-s3](scripts/upload-to-s3) - Upload file to AWS S3 bucket with compression based on file extension in S3 URL. Skips upload if the local file's hash is identical to the S3 object's metadata `sha256sum`. Adds the following user defined metadata to uploaded S3 object: - - `sha256sum` - hash of the file generated by [sha256sum](sha256sum) + - `sha256sum` - hash of the file generated by [sha256sum](scripts/sha256sum) - `recordcount` - the line count of the file -- [download-from-s3](download-from-s3) - Download file from AWS S3 bucket with decompression based on file extension in S3 URL. +- [download-from-s3](scripts/download-from-s3) - Download file from AWS S3 bucket with decompression based on file extension in S3 URL. Skips download if the local file already exists and has a hash identical to the S3 object's metadata `sha256sum`. +## Snakemake + +Snakemake workflow functions that are shared across many pathogen workflows that don’t really belong in any of our existing tools. + +- [config.smk](snakemake/config.smk) - Shared functions for handling workflow configs. +- [remote_files.smk](snakemake/remote_files.smk) - Exposes the `path_or_url` function which will use Snakemake's storage plugins to download/upload files to remote providers as needed. + + ## Software requirements Some scripts may require Bash ≥4. If you are running these scripts on macOS, the builtin Bash (`/bin/bash`) does not meet this requirement. 
You can install [Homebrew's Bash](https://formulae.brew.sh/formula/bash) which is more up to date. diff --git a/ingest/vendored/notify-slack b/ingest/vendored/notify-slack deleted file mode 100755 index a343435..0000000 --- a/ingest/vendored/notify-slack +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" -: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" - -upload=0 -output=/dev/null -thread_ts="" -broadcast=0 -args=() - -for arg; do - case "$arg" in - --upload) - upload=1;; - --output=*) - output="${arg#*=}";; - --thread-ts=*) - thread_ts="${arg#*=}";; - --broadcast) - broadcast=1;; - *) - args+=("$arg");; - esac -done - -set -- "${args[@]}" - -text="${1:?Some message text is required.}" - -if [[ "$upload" == 1 ]]; then - echo "Uploading data to Slack with the message: $text" - curl https://slack.com/api/files.upload \ - --header "Authorization: Bearer $SLACK_TOKEN" \ - --form-string channels="$SLACK_CHANNELS" \ - --form-string title="$text" \ - --form-string filename="$text" \ - --form-string thread_ts="$thread_ts" \ - --form file=@/dev/stdin \ - --form filetype=text \ - --fail --silent --show-error \ - --http1.1 \ - --output "$output" -else - echo "Posting Slack message: $text" - curl https://slack.com/api/chat.postMessage \ - --header "Authorization: Bearer $SLACK_TOKEN" \ - --form-string channel="$SLACK_CHANNELS" \ - --form-string text="$text" \ - --form-string thread_ts="$thread_ts" \ - --form-string reply_broadcast="$broadcast" \ - --fail --silent --show-error \ - --http1.1 \ - --output "$output" -fi diff --git a/ingest/vendored/scripts/assign-colors b/ingest/vendored/scripts/assign-colors new file mode 100755 index 0000000..e42a44d --- /dev/null +++ b/ingest/vendored/scripts/assign-colors @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +""" +Generate colors.tsv for augur export based on ordering, color schemes, and +traits that exists in the 
metadata. +""" +import argparse +import pandas as pd + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="Assign colors based on defined ordering of traits.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument('--ordering', type=str, required=True, + help="""Input TSV file defining the color ordering where the first + column is the field and the second column is the trait in that field. + Blank lines are ignored. Lines starting with '#' will be ignored as comments.""") + parser.add_argument('--color-schemes', type=str, required=True, + help="Input color schemes where each line is a different color scheme separated by tabs.") + parser.add_argument('--metadata', type=str, + help="""If provided, restrict colors to only those traits found in + metadata. If the metadata includes a 'focal' column that only contains + boolean values, then restrict colors to traits for rows where 'focal' + is set to True.""") + parser.add_argument('--output', type=str, required=True, + help="Output colors TSV file to be passed to augur export.") + args = parser.parse_args() + + assignment = {} + with open(args.ordering) as f: + for line in f.readlines(): + array = line.strip().split("\t") + # Ignore empty lines or commented lines + if not array or not array[0] or array[0].startswith('#'): + continue + # Throw a warning if encountering a line not matching the expected number of columns, ignore line + elif len(array)!=2: + print(f"WARNING: Could not decode color ordering line: {line}") + continue + # Otherwise, process color ordering where we expect 2 columns: name, traits + else: + name = array[0] + trait = array[1] + if name not in assignment: + assignment[name] = [trait] + else: + assignment[name].append(trait) + + # if metadata supplied, go through and + # 1. remove assignments that don't exist in metadata + # 2. 
remove assignments that have 'focal' set to 'False' in metadata + if args.metadata: + metadata = pd.read_csv(args.metadata, delimiter='\t') + for name, trait in assignment.items(): + if name in metadata: + if 'focal' in metadata and metadata['focal'].dtype == 'bool': + focal_list = metadata.loc[metadata['focal'], name].unique() + subset_focal = [x for x in assignment[name] if x in focal_list] + assignment[name] = subset_focal + else: # no 'focal' present + subset_present = [x for x in assignment[name] if x in metadata[name].unique()] + assignment[name] = subset_present + + + schemes = {} + counter = 0 + with open(args.color_schemes) as f: + for line in f.readlines(): + counter += 1 + array = line.lstrip().rstrip().split("\t") + schemes[counter] = array + + with open(args.output, 'w') as f: + for trait_name, trait_array in assignment.items(): + if len(trait_array)==0: + print(f"No traits found for {trait_name}") + continue + if len(schemes) < len(trait_array): + color_array = schemes[len(schemes)] + remain = len(trait_array) - len(schemes) + while (remain > 0): + if (remain>len(schemes)): + color_array = [*color_array, *schemes[len(schemes)]] + remain -= len(schemes) + else: + color_array = [*color_array, *schemes[remain]] + remain = 0 + else: + color_array = schemes[len(trait_array)] + + zipped = list(zip(trait_array, color_array)) + for trait_value, color in zipped: + f.write(trait_name + "\t" + trait_value + "\t" + color + "\n") + f.write("\n") diff --git a/ingest/vendored/cloudfront-invalidate b/ingest/vendored/scripts/cloudfront-invalidate similarity index 100% rename from ingest/vendored/cloudfront-invalidate rename to ingest/vendored/scripts/cloudfront-invalidate diff --git a/ingest/vendored/download-from-s3 b/ingest/vendored/scripts/download-from-s3 similarity index 100% rename from ingest/vendored/download-from-s3 rename to ingest/vendored/scripts/download-from-s3 diff --git a/ingest/vendored/fetch-from-ncbi-entrez b/ingest/vendored/scripts/fetch-from-ncbi-entrez similarity index 100% rename from ingest/vendored/fetch-from-ncbi-entrez rename to
ingest/vendored/scripts/fetch-from-ncbi-entrez diff --git a/ingest/vendored/notify-on-diff b/ingest/vendored/scripts/notify-on-diff similarity index 100% rename from ingest/vendored/notify-on-diff rename to ingest/vendored/scripts/notify-on-diff diff --git a/ingest/vendored/notify-on-job-fail b/ingest/vendored/scripts/notify-on-job-fail similarity index 100% rename from ingest/vendored/notify-on-job-fail rename to ingest/vendored/scripts/notify-on-job-fail diff --git a/ingest/vendored/notify-on-job-start b/ingest/vendored/scripts/notify-on-job-start similarity index 100% rename from ingest/vendored/notify-on-job-start rename to ingest/vendored/scripts/notify-on-job-start diff --git a/ingest/vendored/notify-on-record-change b/ingest/vendored/scripts/notify-on-record-change similarity index 100% rename from ingest/vendored/notify-on-record-change rename to ingest/vendored/scripts/notify-on-record-change diff --git a/ingest/vendored/scripts/notify-slack b/ingest/vendored/scripts/notify-slack new file mode 100755 index 0000000..c6f1a87 --- /dev/null +++ b/ingest/vendored/scripts/notify-slack @@ -0,0 +1,93 @@ +#!/usr/bin/env bash +set -euo pipefail + +: "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" +: "${SLACK_CHANNELS:?The SLACK_CHANNELS environment variable is required.}" + +upload=0 +output=/dev/null +thread_ts="" +broadcast=0 +fail_on_error=0 +args=() + +for arg; do + case "$arg" in + --upload) + upload=1;; + --output=*) + output="${arg#*=}";; + --thread-ts=*) + thread_ts="${arg#*=}";; + --broadcast) + broadcast=1;; + --fail-on-error) + fail_on_error=1;; + *) + args+=("$arg");; + esac +done + +set -- "${args[@]}" + +text="${1:?Some message text is required.}" + +send_slack_message() { + if [[ "$upload" == 1 ]]; then + echo "Uploading data to Slack with the message: $text" + + upload_file="$(mktemp -t upload-file-XXXXXX)" + trap "rm -f '$upload_file'" EXIT + + cat /dev/stdin > "$upload_file" + # printf used to strip whitespace from output of 
macOS/BSD wc + # See + length=$(printf '%d' "$(<"$upload_file" wc -c)") + + upload_info=$(curl https://slack.com/api/files.getUploadURLExternal \ + --header "Authorization: Bearer $SLACK_TOKEN" \ + --form-string filename="$text" \ + --form-string length="$length" \ + --fail --silent --show-error \ + --http1.1 ) + + upload_url="$(jq -r .upload_url <<< "$upload_info")" + curl "$upload_url" \ + --form-string filename="$text" \ + --form file="@$upload_file" \ + --fail --silent --show-error \ + --http1.1 > /dev/null + + files_uploaded="$(jq -r "[{id: .file_id}]" <<< "$upload_info")" + curl -X POST https://slack.com/api/files.completeUploadExternal \ + --header "Authorization: Bearer $SLACK_TOKEN" \ + --form-string channel_id="$SLACK_CHANNELS" \ + --form-string thread_ts="$thread_ts" \ + --form-string files="$files_uploaded" \ + --fail --silent --show-error \ + --http1.1 \ + --output "$output" + + else + echo "Posting Slack message: $text" + curl https://slack.com/api/chat.postMessage \ + --header "Authorization: Bearer $SLACK_TOKEN" \ + --form-string channel="$SLACK_CHANNELS" \ + --form-string text="$text" \ + --form-string thread_ts="$thread_ts" \ + --form-string reply_broadcast="$broadcast" \ + --fail --silent --show-error \ + --http1.1 \ + --output "$output" + fi +} + +if ! send_slack_message; then + if [[ "$fail_on_error" == 1 ]]; then + echo "Sending Slack message failed" + exit 1 + else + echo "Sending Slack message failed, but exiting with success anyway." 
+ exit 0 + fi +fi diff --git a/ingest/vendored/s3-object-exists b/ingest/vendored/scripts/s3-object-exists similarity index 100% rename from ingest/vendored/s3-object-exists rename to ingest/vendored/scripts/s3-object-exists diff --git a/ingest/vendored/sha256sum b/ingest/vendored/scripts/sha256sum similarity index 100% rename from ingest/vendored/sha256sum rename to ingest/vendored/scripts/sha256sum diff --git a/ingest/vendored/trigger b/ingest/vendored/scripts/trigger similarity index 100% rename from ingest/vendored/trigger rename to ingest/vendored/scripts/trigger diff --git a/ingest/vendored/trigger-on-new-data b/ingest/vendored/scripts/trigger-on-new-data similarity index 100% rename from ingest/vendored/trigger-on-new-data rename to ingest/vendored/scripts/trigger-on-new-data diff --git a/ingest/vendored/upload-to-s3 b/ingest/vendored/scripts/upload-to-s3 similarity index 100% rename from ingest/vendored/upload-to-s3 rename to ingest/vendored/scripts/upload-to-s3 diff --git a/ingest/vendored/snakemake/config.smk b/ingest/vendored/snakemake/config.smk new file mode 100644 index 0000000..76b2b75 --- /dev/null +++ b/ingest/vendored/snakemake/config.smk @@ -0,0 +1,156 @@ +""" +Shared functions to be used within a Snakemake workflow for handling +workflow configs. +""" +import os +import sys +import yaml +from collections.abc import Callable +from typing import Optional +from textwrap import dedent, indent + + +# Set search paths for Augur +if "AUGUR_SEARCH_PATHS" in os.environ: + print(dedent(f"""\ + Using existing search paths in AUGUR_SEARCH_PATHS: + + {os.environ["AUGUR_SEARCH_PATHS"]!r} + """), file=sys.stderr) +else: + # Note that this differs from the search paths used in + # resolve_config_path(). + # This is the preferred default moving forwards, and the plan is to + # eventually update resolve_config_path() to use AUGUR_SEARCH_PATHS. 
+ search_paths = [ + # User analysis directory + Path.cwd(), + + # Workflow defaults folder + Path(workflow.basedir) / "defaults", + + # Workflow root (contains Snakefile) + Path(workflow.basedir), + ] + + # This should work for majority of workflows, but we could consider doing a + # more thorough search for the nextstrain-pathogen.yaml. This would likely + # replicate how CLI searches for the root.¹ + # ¹ + repo_root = Path(workflow.basedir) / ".." + if (repo_root / "nextstrain-pathogen.yaml").is_file(): + search_paths.extend([ + # Pathogen repo root + repo_root, + ]) + + search_paths = [path.resolve() for path in search_paths if path.is_dir()] + + os.environ["AUGUR_SEARCH_PATHS"] = ":".join(map(str, search_paths)) + + +class InvalidConfigError(Exception): + pass + + +def resolve_config_path(path: str, defaults_dir: Optional[str] = None) -> Callable: + """ + Resolve a relative *path* given in a configuration value. Will always try to + resolve *path* after expanding wildcards with Snakemake's `expand` functionality. + + Returns the path for the first existing file, checked in the following order: + 1. relative to the analysis directory or workdir, usually given by ``--directory`` (``-d``) + 2. relative to *defaults_dir* if it's provided + 3. relative to the workflow's ``defaults/`` directory if *defaults_dir* is _not_ provided + + This behaviour allows a default configuration value to point to a default + auxiliary file while also letting the file used be overridden either by + setting an alternate file path in the configuration or by creating a file + with the conventional name in the workflow's analysis directory. 
+ """ + global workflow + + def _resolve_config_path(wildcards): + try: + expanded_path = expand(path, **wildcards)[0] + except snakemake.exceptions.WildcardError as e: + available_wildcards = "\n".join(f" - {wildcard}" for wildcard in wildcards) + raise snakemake.exceptions.WildcardError(indent(dedent(f"""\ + {str(e)} + + However, resolve_config_path({{path}}) requires the wildcard. + + Wildcards available for this path are: + + {{available_wildcards}} + + Hint: Check that the config path value does not misspell the wildcard name + and that the rule actually uses the wildcard name. + """.lstrip("\n").rstrip()).format(path=repr(path), available_wildcards=available_wildcards), " " * 4)) + + if os.path.exists(expanded_path): + return expanded_path + + if defaults_dir: + defaults_path = os.path.join(defaults_dir, expanded_path) + else: + # Special-case defaults/… for backwards compatibility with older + # configs. We could achieve the same behaviour with a symlink + # (defaults/defaults → .) but that seems less clear. + if path.startswith("defaults/"): + defaults_path = os.path.join(workflow.basedir, expanded_path) + else: + defaults_path = os.path.join(workflow.basedir, "defaults", expanded_path) + + if os.path.exists(defaults_path): + return defaults_path + + raise InvalidConfigError(indent(dedent(f"""\ + Unable to resolve the config-provided path {path!r}, + expanded to {expanded_path!r} after filling in wildcards. + The workflow does not include the default file {defaults_path!r}. + + Hint: Check that the file {expanded_path!r} exists in your analysis + directory or remove the config param to use the workflow defaults. + """), " " * 4)) + + return _resolve_config_path + + +def write_config(path, section=None): + """ + Write Snakemake's 'config' variable, or a section of it, to a file. + + *section* is an optional list of keys to navigate to a specific section of + config. If provided, only that section will be written. 
+ """ + global config + + os.makedirs(os.path.dirname(path), exist_ok=True) + + data = config + section_str = "config" + + if section: + # Navigate to the specified section + for key in section: + # Error if key doesn't exist + if key not in data: + raise Exception(f"ERROR: Key {key!r} not found in {section_str!r}.") + + data = data[key] + section_str += f".{key}" + + # Error if value is not a mapping + if not isinstance(data, dict): + raise Exception(f"ERROR: {section_str!r} is not a mapping of key/value pairs.") + + with open(path, 'w') as f: + yaml.dump(data, f, sort_keys=False, Dumper=NoAliasDumper) + + print(f"Saved {section_str!r} to {path!r}.", file=sys.stderr) + + +class NoAliasDumper(yaml.SafeDumper): + def ignore_aliases(self, data): + return True diff --git a/ingest/vendored/snakemake/remote_files.smk b/ingest/vendored/snakemake/remote_files.smk new file mode 100644 index 0000000..844f80e --- /dev/null +++ b/ingest/vendored/snakemake/remote_files.smk @@ -0,0 +1,159 @@ +""" +Helper functions to set-up storage plugins for remote inputs/outputs. See the +docstring of `path_or_url` for usage instructions. + +The errors raised by storage plugins are often confusing. For instance, a HTTP +404 error will result in a `MissingInputException` with little hint as to the +underlying issue. S3 credentials errors are similarly confusing and we attempt +to check these ourselves to improve UX here. +""" + +from urllib.parse import urlparse + +# Keep a list of known public buckets, which we'll allow uncredentialled (unsigned) access to +# We could make this config-definable in the future +PUBLIC_BUCKETS = set(['nextstrain-data']) + +# Keep track of registered storage plugins to enable reuse +_storage_registry = {} + +class RemoteFilesMissingCredentials(Exception): + pass + +def _storage_s3(*, bucket, keep_local, retries) -> snakemake.storage.StorageProviderProxy: + """ + Registers and returns an instance of snakemake-storage-plugin-s3. 
Typically AWS + credentials are required for _any_ request however we allow requests to known + public buckets (see `PUBLIC_BUCKETS`) to be unsigned which allows for a nice user + experience in the common case of downloading inputs from s3://nextstrain-data. + + The intended behaviour for various (S3) URIs supplied to `path_or_url` is: + + | | S3 buckets | credentials present | credentials missing | + |----------|----------------------------|---------------------|---------------------| + | download | private / private + public | signed | Credentials Error | + | | public | signed | unsigned | + | upload | private / private + public | signed | Credentials Error | + | | public | signed | AccessDenied Error | + """ + # If the bucket is public then we may use an unsigned request which has the nice UX + # of not needing credentials to be present. If we've made other signed requests _or_ + # credentials are present then we just sign everything. This has implications for upload: + # if you attempt to upload to a public bucket without credentials then we allow that here + # and you'll get a subsequent `AccessDenied` error when the upload is attempted. 
+ if bucket in PUBLIC_BUCKETS and \ + "s3_signed" not in _storage_registry and \ + ("s3_unsigned" in _storage_registry or not _aws_credentials_present()): + + if provider:=_storage_registry.get('s3_unsigned', None): + return provider + + from botocore import UNSIGNED # dependency of snakemake-storage-plugin-s3 + storage s3_unsigned: + provider="s3", + signature_version=UNSIGNED, + retries=retries, + keep_local=keep_local, + + _storage_registry['s3_unsigned'] = storage.s3_unsigned + return _storage_registry['s3_unsigned'] + + # Resource fetched/uploaded via a signed request, which will require AWS credentials + if provider:=_storage_registry.get('s3_signed', None): + return provider + + # Enforce the presence of credentials to paper over + if not _aws_credentials_present(): + raise RemoteFilesMissingCredentials() + + # the tag appears in the local file path, so reference 'signed' to give a hint about credential errors + storage s3_signed: + provider="s3", + retries=retries, + keep_local=keep_local, + + _storage_registry['s3_signed'] = storage.s3_signed + return _storage_registry['s3_signed'] + +def _aws_credentials_present() -> bool: + import boto3 # dependency of snakemake-storage-plugin-s3 + session = boto3.Session() + creds = session.get_credentials() + return creds is not None + +def _storage_http(*, keep_local, retries) -> snakemake.storage.StorageProviderProxy: + """ + Registers and returns an instance of snakemake-storage-plugin-http + """ + if provider:=_storage_registry.get('http', None): + return provider + + storage: + provider="http", + allow_redirects=True, + supports_head=True, + keep_local=keep_local, + retries=retries, + + _storage_registry['http'] = storage.http + return _storage_registry['http'] + + +def path_or_url(uri, *, keep_local=True, retries=2) -> str: + """ + Intended for use in Snakemake inputs / outputs to transparently use remote + resources. Returns the URI wrapped by an applicable storage plugin. 
Local + filepaths will be returned unchanged. + + For example, the following rule will download inputs from HTTPS and upload + the output to S3: + + rule filter: + input: + sequences = path_or_url("https://data.nextstrain.org/..."), + metadata = path_or_url("https://data.nextstrain.org/..."), + output: + sequences = path_or_url("s3://...") + shell: + r''' + augur filter \ + --sequences {input.sequences:q} \ + --metadata {input.metadata:q} \ + --metadata-id-columns accession \ + --output-sequences {output.sequences:q} + ''' + + If *keep_local* is True (the default) then downloaded/uploaded files will + remain in `.snakemake/storage/`. The presence of a previously downloaded + file (via `keep_local=True`) does not guarantee that the file will not be + re-downloaded if the storage plugin decides the local file is out of date. + + Depending on the *uri*, authentication may be required. See the specific + helper functions (such as `_storage_s3`) for more details. + + See <https://snakemake.readthedocs.io/en/stable/snakefiles/storage.html> for + more information on Snakemake storage plugins. Note: various snakemake + plugins will be required depending on the URIs provided. 
+ """ + info = urlparse(uri) + + if info.scheme=='': # local + return uri # no storage wrapper + + if info.scheme=='s3': + try: + return _storage_s3(bucket=info.netloc, keep_local=keep_local, retries=retries)(uri) + except RemoteFilesMissingCredentials as e: + raise Exception(f"AWS credentials are required to access {uri!r}") from e + + if info.scheme=='https': + return _storage_http(keep_local=keep_local, retries=retries)(uri) + elif info.scheme=='http': + raise Exception(f"HTTP remote file support is not implemented in nextstrain workflows (attempting to access {uri!r}).\n" + "Please use an HTTPS address instead.") + + if info.scheme in ['gs', 'gcs']: + raise Exception(f"Google Storage is not yet implemented for nextstrain workflows (attempting to access {uri!r}).\n" + "Please get in touch if you require this functionality and we can add it to our workflows") + + raise Exception(f"Input address {uri!r} (scheme={info.scheme!r}) is from a non-supported remote") From 23e46bcf26357abf9108bb32d3e2baa4ab8b9dd1 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Wed, 15 Apr 2026 10:19:19 -0700 Subject: [PATCH 2/2] ingest: fix vendored script paths Follow up to automated update of the vendored subrepo in the previous commit. 
--- ingest/build-configs/nextstrain-automation/upload.smk | 2 +- ingest/rules/fetch_from_ncbi.smk | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ingest/build-configs/nextstrain-automation/upload.smk b/ingest/build-configs/nextstrain-automation/upload.smk index 1f1929b..d51ec53 100644 --- a/ingest/build-configs/nextstrain-automation/upload.smk +++ b/ingest/build-configs/nextstrain-automation/upload.smk @@ -30,7 +30,7 @@ rule upload_to_s3: cloudfront_domain=config["cloudfront_domain"], shell: r""" - ./vendored/upload-to-s3 \ + ./vendored/scripts/upload-to-s3 \ {params.quiet} \ {input.file_to_upload:q} \ {params.s3_dst:q}/{wildcards.remote_file:q} \ diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk index db26308..8370670 100644 --- a/ingest/rules/fetch_from_ncbi.smk +++ b/ingest/rules/fetch_from_ncbi.smk @@ -95,7 +95,7 @@ rule fetch_ncbi_entrez_data: r""" exec &> >(tee {log:q}) - vendored/fetch-from-ncbi-entrez \ + vendored/scripts/fetch-from-ncbi-entrez \ --term {params.term:q} \ --output {output.genbank:q} """