2 changes: 1 addition & 1 deletion ingest/build-configs/nextstrain-automation/upload.smk
@@ -30,7 +30,7 @@ rule upload_to_s3:
cloudfront_domain=config["cloudfront_domain"],
shell:
r"""
-./vendored/upload-to-s3 \
+./vendored/scripts/upload-to-s3 \
{params.quiet} \
{input.file_to_upload:q} \
{params.s3_dst:q}/{wildcards.remote_file:q} \
2 changes: 1 addition & 1 deletion ingest/rules/fetch_from_ncbi.smk
@@ -95,7 +95,7 @@ rule fetch_ncbi_entrez_data:
r"""
exec &> >(tee {log:q})

-vendored/fetch-from-ncbi-entrez \
+vendored/scripts/fetch-from-ncbi-entrez \
--term {params.term:q} \
--output {output.genbank:q}
"""
2 changes: 1 addition & 1 deletion ingest/vendored/.github/dependabot.yml
@@ -4,7 +4,7 @@
# Each ecosystem is checked on a scheduled interval defined below. To trigger
# a check manually, go to
#
-# https://github.com/nextstrain/ingest/network/updates
+# https://github.com/nextstrain/shared/network/updates
#
# and look for a "Check for updates" button. You may need to click around a
# bit first.
2 changes: 1 addition & 1 deletion ingest/vendored/.github/workflows/ci.yaml
@@ -11,5 +11,5 @@ jobs:
shellcheck:
runs-on: ubuntu-latest
steps:
-- uses: actions/checkout@v4
+- uses: actions/checkout@v6
- uses: nextstrain/.github/actions/shellcheck@master
4 changes: 2 additions & 2 deletions ingest/vendored/.github/workflows/pre-commit.yaml
@@ -7,8 +7,8 @@ jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
-- uses: actions/checkout@v4
-- uses: actions/setup-python@v5
+- uses: actions/checkout@v6
+- uses: actions/setup-python@v6
with:
python-version: "3.12"
- uses: pre-commit/action@v3.0.1
6 changes: 3 additions & 3 deletions ingest/vendored/.gitrepo
@@ -6,7 +6,7 @@
[subrepo]
remote = https://github.com/nextstrain/ingest
branch = main
-commit = 258ab8ce898a88089bc88caee336f8d683a0e79a
-parent = a7af2c05fc4ccc822c8ef38f0001dc5e8bee803b
+commit = c29898f7c32c3f85d65db235d23a78e776f89120
+parent = 2d06e5de2f761a090d45e0bfcb8b1d510fffdc83
method = merge
-cmdver = 0.4.7
+cmdver = 0.4.9
70 changes: 40 additions & 30 deletions ingest/vendored/README.md
@@ -1,6 +1,6 @@
-# ingest
+# shared

-Shared internal tooling for pathogen data ingest. Used by our individual
+Shared internal tooling for pathogen workflows. Used by our individual
pathogen repos which produce Nextstrain builds. Expected to be vendored by
each pathogen repo using `git subrepo`.

@@ -9,47 +9,47 @@ Some tools may only live here temporarily before finding a permanent home in

## Vendoring

-Nextstrain maintained pathogen repos will use [`git subrepo`](https://github.com/ingydotnet/git-subrepo) to vendor ingest scripts.
-(See discussion on this decision in https://github.com/nextstrain/ingest/issues/3)
+Nextstrain maintained pathogen repos will use [`git subrepo`](https://github.com/ingydotnet/git-subrepo) to vendor shared scripts.
+(See discussion on this decision in https://github.com/nextstrain/shared/issues/3)

For a list of Nextstrain repos that are currently using this method, use [this
GitHub code search](https://github.com/search?type=code&q=org%3Anextstrain+subrepo+%22remote+%3D+https%3A%2F%2Fgithub.com%2Fnextstrain%2Fingest%22).

If you don't already have `git subrepo` installed, follow the [git subrepo installation instructions](https://github.com/ingydotnet/git-subrepo#installation).
-Then add the latest ingest scripts to the pathogen repo by running:
+Then add the latest shared scripts to the pathogen repo by running:

```
-git subrepo clone https://github.com/nextstrain/ingest ingest/vendored
+git subrepo clone https://github.com/nextstrain/shared shared/vendored
```

-Any future updates of ingest scripts can be pulled in with:
+Any future updates of shared scripts can be pulled in with:

```
-git subrepo pull ingest/vendored
+git subrepo pull shared/vendored
```

If you run into merge conflicts and would like to pull in a fresh copy of the
-latest ingest scripts, pull with the `--force` flag:
+latest shared scripts, pull with the `--force` flag:

```
-git subrepo pull ingest/vendored --force
+git subrepo pull shared/vendored --force
```

> **Warning**
> Beware of rebasing/dropping the parent commit of a `git subrepo` update

-`git subrepo` relies on metadata in the `ingest/vendored/.gitrepo` file,
+`git subrepo` relies on metadata in the `shared/vendored/.gitrepo` file,
which includes the hash for the parent commit in the pathogen repos.
If this hash no longer exists in the commit history, there will be errors when
running future `git subrepo pull` commands.

If you run into an error similar to the following:
```
-$ git subrepo pull ingest/vendored
-git-subrepo: Command failed: 'git branch subrepo/ingest/vendored '.
+$ git subrepo pull shared/vendored
+git-subrepo: Command failed: 'git branch subrepo/shared/vendored '.
fatal: not a valid object name: ''
```
-Check the parent commit hash in the `ingest/vendored/.gitrepo` file and make
+Check the parent commit hash in the `shared/vendored/.gitrepo` file and make
sure the commit exists in the commit history. Update to the appropriate parent
commit hash if needed.

@@ -84,39 +84,49 @@ approach to "ingest" has been discussed in various internal places, including:

## Scripts

-Scripts for supporting ingest workflow automation that don’t really belong in any of our existing tools.
+Scripts for supporting workflow automation that don’t really belong in any of our existing tools.

-- [notify-on-diff](notify-on-diff) - Send Slack message with diff of a local file and an S3 object
-- [notify-on-job-fail](notify-on-job-fail) - Send Slack message with details about failed workflow job on GitHub Actions and/or AWS Batch
-- [notify-on-job-start](notify-on-job-start) - Send Slack message with details about workflow job on GitHub Actions and/or AWS Batch
-- [notify-on-record-change](notify-on-recod-change) - Send Slack message with details about line count changes for a file compared to an S3 object's metadata `recordcount`.
+- [assign-colors](scripts/assign-colors) - Generate colors.tsv for augur export based on ordering, color schemes, and what exists in the metadata. Used in the phylogenetic or nextclade workflows.
+- [notify-on-diff](scripts/notify-on-diff) - Send Slack message with diff of a local file and an S3 object
+- [notify-on-job-fail](scripts/notify-on-job-fail) - Send Slack message with details about failed workflow job on GitHub Actions and/or AWS Batch
+- [notify-on-job-start](scripts/notify-on-job-start) - Send Slack message with details about workflow job on GitHub Actions and/or AWS Batch
+- [notify-on-record-change](scripts/notify-on-record-change) - Send Slack message with details about line count changes for a file compared to an S3 object's metadata `recordcount`.
If the S3 object's metadata does not have `recordcount`, then will attempt to download S3 object to count lines locally, which only supports `xz` compressed S3 objects.
-- [notify-slack](notify-slack) - Send message or file to Slack
-- [s3-object-exists](s3-object-exists) - Used to prevent 404 errors during S3 file comparisons in the notify-* scripts
-- [trigger](trigger) - Triggers downstream GitHub Actions via the GitHub API using repository_dispatch events.
-- [trigger-on-new-data](trigger-on-new-data) - Triggers downstream GitHub Actions if the provided `upload-to-s3` outputs do not contain the `identical_file_message`
+- [notify-slack](scripts/notify-slack) - Send message or file to Slack
+- [s3-object-exists](scripts/s3-object-exists) - Used to prevent 404 errors during S3 file comparisons in the notify-* scripts
+- [trigger](scripts/trigger) - Triggers downstream GitHub Actions via the GitHub API using repository_dispatch events.
+- [trigger-on-new-data](scripts/trigger-on-new-data) - Triggers downstream GitHub Actions if the provided `upload-to-s3` outputs do not contain the `identical_file_message`
A hacky way to ensure that we only trigger downstream phylogenetic builds if the S3 objects have been updated.
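For illustration, the `repository_dispatch` request body that trigger-style scripts send to the GitHub API can be sketched as follows. This is a hypothetical sketch, not the vendored script itself; the event type `"rebuild"` and payload keys are invented examples, while the endpoint shape (`POST /repos/{owner}/{repo}/dispatches`) is GitHub's documented API.

```python
import json

# Build the request body GitHub's repository_dispatch endpoint
# (POST /repos/{owner}/{repo}/dispatches) expects.
# "rebuild" and the client_payload contents are hypothetical examples.
def dispatch_payload(event_type, client_payload=None):
    body = {"event_type": event_type}
    if client_payload:
        body["client_payload"] = client_payload
    return json.dumps(body)

print(dispatch_payload("rebuild", {"source": "ingest"}))
```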


NCBI interaction scripts that are useful for fetching public metadata and sequences.

-- [fetch-from-ncbi-entrez](fetch-from-ncbi-entrez) - Fetch metadata and nucleotide sequences from [NCBI Entrez](https://www.ncbi.nlm.nih.gov/books/NBK25501/) and output to a GenBank file.
+- [fetch-from-ncbi-entrez](scripts/fetch-from-ncbi-entrez) - Fetch metadata and nucleotide sequences from [NCBI Entrez](https://www.ncbi.nlm.nih.gov/books/NBK25501/) and output to a GenBank file.
Useful for pathogens with metadata and annotations in custom fields that are not part of the standard [NCBI Datasets](https://www.ncbi.nlm.nih.gov/datasets/) outputs.

-Historically, some pathogen repos used the undocumented NCBI Virus API through [fetch-from-ncbi-virus](https://github.com/nextstrain/ingest/blob/c97df238518171c2b1574bec0349a55855d1e7a7/fetch-from-ncbi-virus) to fetch data. However we've opted to drop the NCBI Virus scripts due to https://github.com/nextstrain/ingest/issues/18.
+Historically, some pathogen repos used the undocumented NCBI Virus API through [fetch-from-ncbi-virus](https://github.com/nextstrain/shared/blob/c97df238518171c2b1574bec0349a55855d1e7a7/fetch-from-ncbi-virus) to fetch data. However we've opted to drop the NCBI Virus scripts due to https://github.com/nextstrain/shared/issues/18.
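As a rough illustration of the Entrez interaction, an `esearch` query URL for the E-utilities API can be assembled like this. The endpoint and parameter names (`db`, `term`, `retmax`) come from NCBI's E-utilities documentation; the search term and the helper itself are hypothetical, and the real fetching logic lives in the vendored script.

```python
from urllib.parse import urlencode

# Base URL for NCBI's Entrez E-utilities.
EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

# Hypothetical helper: build an esearch query URL for a given term.
def esearch_url(term, db="nucleotide", retmax=100):
    query = urlencode({"db": db, "term": term, "retmax": retmax})
    return f"{EUTILS}/esearch.fcgi?{query}"

print(esearch_url("Zika virus"))
```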

Potential Nextstrain CLI scripts

-- [sha256sum](sha256sum) - Used to check if files are identical in upload-to-s3 and download-from-s3 scripts.
-- [cloudfront-invalidate](cloudfront-invalidate) - CloudFront invalidation is already supported in the [nextstrain remote command for S3 files](https://github.com/nextstrain/cli/blob/a5dda9c0579ece7acbd8e2c32a4bbe95df7c0bce/nextstrain/cli/remote/s3.py#L104).
+- [sha256sum](scripts/sha256sum) - Used to check if files are identical in upload-to-s3 and download-from-s3 scripts.
+- [cloudfront-invalidate](scripts/cloudfront-invalidate) - CloudFront invalidation is already supported in the [nextstrain remote command for S3 files](https://github.com/nextstrain/cli/blob/a5dda9c0579ece7acbd8e2c32a4bbe95df7c0bce/nextstrain/cli/remote/s3.py#L104).
This exists as a separate script to support CloudFront invalidation when using the upload-to-s3 script.
-- [upload-to-s3](upload-to-s3) - Upload file to AWS S3 bucket with compression based on file extension in S3 URL.
+- [upload-to-s3](scripts/upload-to-s3) - Upload file to AWS S3 bucket with compression based on file extension in S3 URL.
Skips upload if the local file's hash is identical to the S3 object's metadata `sha256sum`.
Adds the following user defined metadata to uploaded S3 object:
-- `sha256sum` - hash of the file generated by [sha256sum](sha256sum)
+- `sha256sum` - hash of the file generated by [sha256sum](scripts/sha256sum)
- `recordcount` - the line count of the file
-- [download-from-s3](download-from-s3) - Download file from AWS S3 bucket with decompression based on file extension in S3 URL.
+- [download-from-s3](scripts/download-from-s3) - Download file from AWS S3 bucket with decompression based on file extension in S3 URL.
Skips download if the local file already exists and has a hash identical to the S3 object's metadata `sha256sum`.
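The skip logic shared by the upload and download scripts boils down to comparing a local SHA-256 digest with the one stored in the S3 object's `sha256sum` metadata. A minimal sketch of that core idea (illustrative only, not the vendored scripts, which additionally handle compression and AWS calls):

```python
import hashlib

# Hex SHA-256 digest of a byte string, mirroring what `sha256sum`
# records in the S3 object's user-defined metadata.
def sha256_hex(data: bytes) -> str:
    return hashlib.sha256(data).hexdigest()

# Transfer only when the local content differs from the remote digest.
def should_transfer(local_bytes: bytes, remote_sha256sum: str) -> bool:
    return sha256_hex(local_bytes) != remote_sha256sum

payload = b"sequences.fasta contents"
digest = sha256_hex(payload)
print(should_transfer(payload, digest))     # identical content: skip
print(should_transfer(b"changed", digest))  # changed content: transfer
```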

+## Snakemake
+
+Snakemake workflow functions that are shared across many pathogen workflows that don’t really belong in any of our existing tools.
+
+- [config.smk](snakemake/config.smk) - Shared functions for handling workflow configs.
+- [remote_files.smk](snakemake/remote_files.smk) - Exposes the `path_or_url` function which will use Snakemake's storage plugins to download/upload files to remote providers as needed.
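The shape of a `path_or_url`-style helper can be sketched as below. This is a hypothetical simplification: the real function in `snakemake/remote_files.smk` delegates to Snakemake's storage plugins, which are stubbed out here with a plain callable.

```python
# Hypothetical sketch: return local paths unchanged and hand anything
# that looks like a remote URL to a storage wrapper (here a stub
# standing in for Snakemake's storage plugin machinery).
def path_or_url(path, storage=lambda url: f"storage({url})"):
    if path.startswith(("s3://", "http://", "https://")):
        return storage(path)
    return path

print(path_or_url("results/metadata.tsv"))
print(path_or_url("s3://nextstrain-data/files/metadata.tsv.zst"))
```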


## Software requirements

Some scripts may require Bash ≥4. If you are running these scripts on macOS, the builtin Bash (`/bin/bash`) does not meet this requirement. You can install [Homebrew's Bash](https://formulae.brew.sh/formula/bash) which is more up to date.
56 changes: 0 additions & 56 deletions ingest/vendored/notify-slack

This file was deleted.

96 changes: 96 additions & 0 deletions ingest/vendored/scripts/assign-colors
@@ -0,0 +1,96 @@
#!/usr/bin/env python3
"""
Generate colors.tsv for augur export based on ordering, color schemes, and
traits that exist in the metadata.
"""
import argparse
import pandas as pd

if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="Assign colors based on defined ordering of traits.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

parser.add_argument('--ordering', type=str, required=True,
help="""Input TSV file defining the color ordering where the first
column is the field and the second column is the trait in that field.
Blank lines are ignored. Lines starting with '#' will be ignored as comments.""")
parser.add_argument('--color-schemes', type=str, required=True,
help="Input color schemes where each line is a different color scheme separated by tabs.")
parser.add_argument('--metadata', type=str,
help="""If provided, restrict colors to only those traits found in
metadata. If the metadata includes a 'focal' column that only contains
boolean values, then restrict colors to traits for rows where 'focal'
is set to True.""")
parser.add_argument('--output', type=str, required=True,
help="Output colors TSV file to be passed to augur export.")
args = parser.parse_args()

assignment = {}
with open(args.ordering) as f:
for line in f.readlines():
array = line.strip().split("\t")
# Ignore empty lines or commented lines
if not array or not array[0] or array[0].startswith('#'):
continue
# Throw a warning if encountering a line not matching the expected number of columns, ignore line
elif len(array)!=2:
print(f"WARNING: Could not decode color ordering line: {line}")
continue
# Otherwise, process color ordering where we expect 2 columns: name, traits
else:
name = array[0]
trait = array[1]
if name not in assignment:
assignment[name] = [trait]
else:
assignment[name].append(trait)

# if metadata supplied, go through and
# 1. remove assignments that don't exist in metadata
# 2. remove assignments that have 'focal' set to 'False' in metadata
if args.metadata:
metadata = pd.read_csv(args.metadata, delimiter='\t')
for name, trait in assignment.items():
if name in metadata:
if 'focal' in metadata and metadata['focal'].dtype == 'bool':
focal_list = metadata.loc[metadata['focal'], name].unique()
subset_focal = [x for x in assignment[name] if x in focal_list]
assignment[name] = subset_focal
else: # no 'focal' present
subset_present = [x for x in assignment[name] if x in metadata[name].unique()]
assignment[name] = subset_present


schemes = {}
counter = 0
with open(args.color_schemes) as f:
for line in f.readlines():
counter += 1
array = line.lstrip().rstrip().split("\t")
schemes[counter] = array

with open(args.output, 'w') as f:
for trait_name, trait_array in assignment.items():
if len(trait_array)==0:
print(f"No traits found for {trait_name}")
continue
if len(schemes)<len(trait_array):
print(f"WARNING: insufficient colours available for trait {trait_name} - reusing colours!")
remain = len(trait_array)
color_array = []
while(remain>0):
if (remain>len(schemes)):
color_array = [*color_array, *schemes[len(schemes)]]
remain -= len(schemes)
else:
color_array = [*color_array, *schemes[remain]]
remain = 0
else:
color_array = schemes[len(trait_array)]

zipped = list(zip(trait_array, color_array))
for trait_value, color in zipped:
f.write(trait_name + "\t" + trait_value + "\t" + color + "\n")
f.write("\n")
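The colour-recycling branch of the script above can be isolated into a small helper for illustration (a sketch mirroring the loop's logic, not part of the changed file; `schemes` maps scheme size to its list of colours, as built by the script):

```python
# When a trait has more values than the largest colour scheme, whole
# schemes are concatenated until every value has a colour; otherwise the
# scheme whose size matches the trait count is used directly.
def recycle_colors(n_traits, schemes):
    remain = n_traits
    colors = []
    while remain > 0:
        size = len(schemes) if remain > len(schemes) else remain
        colors.extend(schemes[size])
        remain -= size
    return colors

schemes = {1: ["#111111"], 2: ["#222222", "#333333"]}
print(recycle_colors(5, schemes))
```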
File renamed without changes.