diff --git a/1_sequence_prediction.py b/1_sequence_prediction.py new file mode 100755 index 0000000..3baf42c --- /dev/null +++ b/1_sequence_prediction.py @@ -0,0 +1,74 @@ +""" +Description: + CLI for sequence prediction using the Sei deep learning model, + given an input FASTA or BED file. + Outputs Sei chromatin profile predictions. + +Usage: + 1_sequence_prediction.py [--genome=] [--cuda] + 1_sequence_prediction.py -h | --help + +Options: + -h --help Show this screen. + Input FASTA or BED file. + Output directory + --genome= If is a BED file, specify the reference + genome hg38 or hg19 [default: hg19] + --cuda Run variant effect prediction on a CUDA-enabled + GPU + +""" +import os + +from docopt import docopt + +from selene_sdk.sequences import Genome +from selene_sdk.utils import load_path +from selene_sdk.utils import parse_configs_and_run + + +def _finditem(obj, val): + for k, v in obj.items(): + if hasattr(v, 'keywords'): + _finditem(v.keywords, val) + elif isinstance(v, dict): + _finditem(v, val) + elif isinstance(v, str) and '' in v: + obj[k] = v.replace('', val) + + +if __name__ == "__main__": + arguments = docopt( + __doc__, + version='0.0.0') + + seq_input = arguments[""] + + os.makedirs(arguments[""], exist_ok=True) + + # Assumes that the `models` directory is in the same directory as this + # script. Please update this line if not. + use_dir = os.path.dirname(os.path.abspath(__file__)) + use_cuda = arguments["--cuda"] + + sei_out = os.path.join(arguments[""], "chromatin-profiles-hdf5") + os.makedirs(sei_out, exist_ok=True) + + configs = load_path("./model/sei_seq_prediction.yml", instantiate=False) + _finditem(configs, use_dir) + + configs["prediction"].update(input_path=seq_input, output_dir=sei_out) + configs["analyze_sequences"].bind(use_cuda=use_cuda) + + # Assumes BED file coordinates input if file doesn't end with .fa or .fasta + if not seq_input.endswith('.fa') and not seq_input.endswith('.fasta'): + hg_version = arguments["--genome"] + genome = None + if hg_version == 'hg38' or hg_version == 'hg19': + genome = Genome( + os.path.join('.', 'resources', '{0}_UCSC.fa'.format(hg_version))) + configs["analyze_sequences"].bind(reference_sequence=genome) + else: + raise ValueError("--genome= must be 'hg19' or 'hg38'") + parse_configs_and_run(configs) + diff --git a/1_sequence_prediction.sh b/1_sequence_prediction.sh new file mode 100755 index 0000000..e1149df --- /dev/null +++ b/1_sequence_prediction.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +##################################################################### +# Example script for running Sei deep learning model sequence +# prediction with Selene + +# Usage: +# sh 1_sequence_prediction.sh --cuda + +# Please only specify hg38 or hg19 as input for if is +# a BED file. If you are using a FASTA file of sequences, you can specify +# whatever genome version you wish for your own reference (will be printed +# as part of output for this script) but do not leave it empty. 
+ +# --cuda is optional, use if you are running on a CUDA-enabled +# GPU machine (see example_slurm_scripts/1_example_seqpred.slurm_gpu.sh) +##################################################################### + +set -o errexit +set -o pipefail +set -o nounset + +input_filepath="${1:-}" +hg_version="${2:-}" +out_dir="${3:-}" +cuda="${4:-}" + +mkdir -p $out_dir + +input_basename=$(basename $input_filepath) +cp $input_filepath $out_dir/ + +echo "Input argments: $input_filepath $out_dir $hg_version $cuda" + +if [ "$cuda" = "--cuda" ] +then + echo "use_cuda: True" + python -u 1_sequence_prediction.py \ + "$out_dir/$input_basename" \ + $out_dir \ + --genome=${hg_version} \ + --cuda +else + echo "use_cuda: False" + python -u 1_sequence_prediction.py \ + "$out_dir/$input_basename" \ + $out_dir \ + --genome=${hg_version} +fi diff --git a/vep_cli.py b/1_variant_effect_prediction.py similarity index 88% rename from vep_cli.py rename to 1_variant_effect_prediction.py index dbfb269..e6e9a9b 100755 --- a/vep_cli.py +++ b/1_variant_effect_prediction.py @@ -1,11 +1,12 @@ """ Description: - CLI for variant effect prediction using the Sei deep learning model. - Outputs both Sei chromatin profile predictions and sequence class scores. + CLI for variant effect prediction using the Sei deep learning model, + given an input VCF file. + Outputs Sei chromatin profile predictions. Usage: - vep_cli.py [--genome=] [--cuda] - vep_cli.py -h | --help + 1_variant_effect_prediction.py [--genome=] [--cuda] + 1_variant_effect_prediction.py -h | --help Options: -h --help Show this screen. @@ -69,5 +70,5 @@ def run_config(config_yml, output_dir): sei_out = os.path.join(arguments[""], "chromatin-profiles-hdf5") os.makedirs(sei_out, exist_ok=True) - run_config("./model/sei_prediction.yml", sei_out) + run_config("./model/sei_varianteffect_prediction.yml", sei_out) diff --git a/run_pipeline.sh b/1_variant_effect_prediction.sh similarity index 50% rename from run_pipeline.sh rename to 1_variant_effect_prediction.sh index 128e8d5..e081124 100755 --- a/run_pipeline.sh +++ b/1_variant_effect_prediction.sh @@ -1,15 +1,16 @@ #!/usr/bin/env bash ############################################################### -# Example script for running the variant effect prediction -# pipeline for Sei and sequence classes. +# Example script for running Sei variant effect prediction +# using Selene. -# sh run_pipeline.sh --cuda +# Usage: +# sh 1_variant_effect_prediction.sh [--cuda] # Please only specify hg38 or hg19 as input for . 
# --cuda is optional, use if you are running on a CUDA-enabled -# GPU machine +# GPU machine (see example_slurm_scripts/1_example_vep.slurm_gpu.sh) ############################################################### set -o errexit @@ -23,21 +24,23 @@ cuda="${4:-}" mkdir -p $outdir +echo "Input arguments: $vcf_filepath $hg_version $outdir $cuda" + vcf_basename=$(basename $vcf_filepath) cp $vcf_filepath $outdir/ if [ "$cuda" = "--cuda" ] then echo "use_cuda: True" - python vep_cli.py "$outdir/$vcf_basename" \ - $outdir \ - --genome=${hg_version} \ - --cuda + python -u 1_variant_effect_prediction.py \ + "$outdir/$vcf_basename" \ + $outdir \ + --genome=${hg_version} \ + --cuda else echo "use_cuda: False" - python vep_cli.py "$outdir/$vcf_basename" \ - $outdir \ - --genome=${hg_version} + python -u 1_variant_effect_prediction.py \ + "$outdir/$vcf_basename" \ + $outdir \ + --genome=${hg_version} fi - -python sequence_class.py $outdir "$vcf_basename" diff --git a/2_raw_sc_score.py b/2_raw_sc_score.py new file mode 100755 index 0000000..e859fc2 --- /dev/null +++ b/2_raw_sc_score.py @@ -0,0 +1,67 @@ +""" +Description: + This script is run after `1_sequence_prediction.py`. It computes the + raw sequence class scores based on Sei chromatin profile predictions + of sequences (i.e. no variants). + +Usage: + 2_raw_sc_score.py + [--out-name=] + 2_raw_sc_score.py -h | --help + +Options: + Path to Sei sequence predictions to compute raw + sequence class projection scores (NO variant effect). + The directory to output the sequence class scores + as an .NPY file. + --out-name= Specify an output filename prefix. Otherwise, output + filenames will be based on . + +""" +import os + +from docopt import docopt +import h5py +import numpy as np +import pandas as pd + +from utils import get_filename_prefix, get_data, get_targets +from utils import sc_projection + + +if __name__ == "__main__": + arguments = docopt( + __doc__, + version='1.0.0') + output_dir = arguments[''] + os.makedirs(output_dir, exist_ok=True) + + input_pred_file = arguments[''] + input_preds = get_data(input_pred_file) + input_dir, input_fn = os.path.split(input_pred_file) + + input_prefix = get_filename_prefix(input_fn) + rowlabels_file = os.path.join(input_dir, '{0}_row_labels.txt'.format( + input_prefix)) + rowlabels = pd.read_csv(rowlabels_file, sep='\t') + if len(rowlabels) != len(input_preds): + raise ValueError(("Rowlabels file '{0}' does not have the same number " + "of rows as '{1}'").format(rowlabels_file, + input_pred_file)) + + output_prefix = arguments['--out-name'] + if output_prefix is None: + output_prefix = input_fn.split('_predictions')[0] + print("Output files will start with prefix '{0}'".format(output_prefix)) + + sei_dir = "./model" + chromatin_profiles = get_targets(os.path.join(sei_dir, "target.names")) + seqclass_names = get_targets(os.path.join(sei_dir, "seqclass.names")) + + clustervfeat = np.load(os.path.join(sei_dir, 'projvec_targets.npy')) + + projscores = sc_projection(input_preds, clustervfeat) + np.save(os.path.join( + output_dir, "{0}.raw_sequence_class_scores.npy".format(output_prefix)), + projscores) + diff --git a/2_raw_sc_score.sh b/2_raw_sc_score.sh new file mode 100755 index 0000000..311cec0 --- /dev/null +++ b/2_raw_sc_score.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +##################################################################### +# Example script for computing the raw sequence class scores +# given Sei chromatin profile sequence predictions + +# Usage: +# sh 2_raw_sc_score.sh 
+##################################################################### + +set -o errexit +set -o pipefail +set -o nounset + +input_filepath="${1:-}" +out_dir="${2:-}" + +mkdir -p $out_dir + +echo "Input arguments: $input_filepath $out_dir" + +python -u 2_raw_sc_score.py $input_filepath $out_dir diff --git a/2_varianteffect_sc_score.py b/2_varianteffect_sc_score.py new file mode 100755 index 0000000..931e8d9 --- /dev/null +++ b/2_varianteffect_sc_score.py @@ -0,0 +1,115 @@ +""" +Description: + This script is run after `1_variant_effect_prediction.py`. It computes + the variant effect sequence class scores based on Sei chromatin profile + predictions, and by default, sorts the variants based on the maximum + absolute scores across sequence classes, outputting the results as + TSV files. + +Usage: + 2_varianteffect_sc_score.py + [--out-name=] + [--no-tsv] + 2_varianteffect_sc_score.py -h | --help + +Options: + Reference sequence Sei predictions file. Assumes + the row labels file is in the same directory. The resulting + variant effect sequence class scores will be computed as + alt - ref sequence class scores (adjusted for nucleosome + occupancy). + Alternate sequence Sei predictions file. + The directory to output the sequence class scores + & TSVs (if `--no-tsv` is not used). + --out-name= Specify an output filename prefix that all output + files will use. Otherwise, filenames will be based on + . + --no-tsv The output TSVs sort the variants by maximum absolute + score across sequence classes and are intended for + easier perusal of the predictions, but writing them + makes this script take much longer to complete. If you + are comfortable working with HDF5 and NPY files, you can + suppress the TSV output and use the files in + `chromatin-profiles-hdf5`.
+ +""" +import os + +from docopt import docopt +import h5py +import numpy as np +import pandas as pd + +from utils import get_filename_prefix, get_data, get_targets +from utils import sc_hnorm_varianteffect +from utils import write_to_tsv + + +if __name__ == "__main__": + arguments = docopt( + __doc__, + version='1.0.0') + ref_pred_file = arguments[''] + alt_pred_file = arguments[''] + + # load predictions + chromatin_profile_ref = get_data(ref_pred_file) + chromatin_profile_alt = get_data(alt_pred_file) + if len(chromatin_profile_ref) != len(chromatin_profile_alt): + raise ValueError(("{0} and {1} have different number of rows: {2} vs {3}, " + "respectively.").format(ref_pred_file, alt_pred_file, + len(chromatin_profile_ref), + len(chromatin_profile_alt))) + + alt_dir, alt_fn = os.path.split(alt_pred_file) + + # checks if the ref/alt are from variant effect prediction (VCF) + # or sequence prediction (BED or FASTA file inputs) + alt_prefix = get_filename_prefix(alt_fn) + rowlabels_file = os.path.join(alt_dir, '{0}_row_labels.txt'.format(alt_prefix)) + rowlabels = pd.read_csv(rowlabels_file, sep='\t') + if len(rowlabels) != len(chromatin_profile_alt): + raise ValueError(("Rowlabels file '{0}' does not have the same number " + "of rows as '{1}'").format(rowlabels_file, alt_pred_file)) + + output_dir = arguments[''] + os.makedirs(output_dir, exist_ok=True) + + output_prefix = arguments['--out-name'] + if output_prefix is None: + output_prefix = alt_prefix + print("Output files will start with prefix '{0}'".format(output_prefix)) + + no_tsv = arguments['--no-tsv'] + + sei_dir = "./model" + chromatin_profiles = get_targets(os.path.join(sei_dir, "target.names")) + seqclass_names = get_targets(os.path.join(sei_dir, "seqclass.names")) + + clustervfeat = np.load(os.path.join(sei_dir, 'projvec_targets.npy')) + histone_inds = np.load(os.path.join(sei_dir, 'histone_inds.npy')) + + diffproj = sc_hnorm_varianteffect( + chromatin_profile_ref, + chromatin_profile_alt, + clustervfeat, + histone_inds) + max_abs_diff = np.abs(diffproj).max(axis=1) + + np.save(os.path.join( + output_dir, "{0}.sequence_class_scores.npy".format(output_prefix)), diffproj) + + if not no_tsv: + write_to_tsv(max_abs_diff, # max sequence class score + chromatin_profile_alt - chromatin_profile_ref, # chromatin profile diffs + diffproj, # sequence class diffs + chromatin_profiles, # chromatin profile targets + seqclass_names, # sequence class names + rowlabels, + os.path.join(output_dir, + "sorted.{0}.chromatin_profile_diffs.tsv".format( + output_prefix)), + os.path.join(output_dir, + "sorted.{0}.sequence_class_scores.tsv".format( + output_prefix))) + diff --git a/2_varianteffect_sc_score.sh b/2_varianteffect_sc_score.sh new file mode 100755 index 0000000..1f742d7 --- /dev/null +++ b/2_varianteffect_sc_score.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +##################################################################### +# Example script for computing the variant effect sequence class scores +# given Sei chromatin profile variant effect predictions + +# Usage: +# sh 2_varianteffect_sc_score.sh +# [--no-tsv] +##################################################################### + +set -o errexit +set -o pipefail +set -o nounset + +ref_fp="${1:-}" +alt_fp="${2:-}" +out_dir="${3:-}" +no_tsv="${4:-}" + +mkdir -p $out_dir + +echo "Input argments: $ref_fp $alt_fp $out_dir $no_tsv" + +if [ "$no_tsv" = "--no-tsv" ] +then + echo "--no-tsv flag is used" + python -u 2_varianteffect_sc_score.py $ref_fp $alt_fp $out_dir --no-tsv +else + echo 
"--no-tsv flag is not used" + python -u 2_varianteffect_sc_score.py $ref_fp $alt_fp $out_dir +fi + diff --git a/README.md b/README.md index 5d87880..f77665c 100755 --- a/README.md +++ b/README.md @@ -1,8 +1,14 @@ -# Sei framework +

+ +

+ + Welcome to the Sei framework repository! Sei is a framework for systematically predicting sequence regulatory activities and applying sequence information to human genetics data. Sei provides a global map from any sequence to regulatory activities, as represented by 40 sequence classes, and each sequence class integrates predictions for 21,907 chromatin profiles (transcription factor, histone marks, and chromatin accessibility profiles across a wide range of cell types). -This repository can be used to run the Sei model and get the Sei chromatin profile and sequence class predictions for an input VCF file. +Sei is now published, you can read the manuscript [here](https://doi.org/10.1038/s41588-022-01102-2). + +This repository can be used to run the Sei model and get the Sei chromatin profile and sequence class predictions for input sequences or variants. We also provide information and instructions for [how to train the Sei deep learning sequence model](#training). @@ -10,10 +16,6 @@ We also provide information and instructions for [how to train the Sei deep lear Sei requires Python 3.6+ and Python packages PyTorch (>=1.0), Selene (>=0.5.0), and `docopt`. You can follow PyTorch installation steps [here](https://pytorch.org/get-started/locally/) and Selene installation steps [here](https://github.com/FunctionLab/selene). Install `docopt` with pip or conda (e.g. `conda install docopt`) -## Variant effect prediction - -Sei predicts variant effects by comparing predictions from a pair of sequences carrying the reference allele and the alternative allele respectively. Our code provides both sequence-class level (40 classes) and chromatin profile-level (21,907 targets) predictions. - ### Setup Please download and extract the trained Sei model and `resources` (containing hg19 and hg38 FASTA files) `.tar.gz` files before proceeding: @@ -25,40 +27,105 @@ sh ./download_data.sh - [Sei model](https://doi.org/10.5281/zenodo.4906996) - [Sei framework `resources` directory](https://doi.org/10.5281/zenodo.4906961) -### Usage +### Chromatin profile prediction + +1. The following scripts can be used to obtain Sei deep learning predictions for 21,907 chromatin profiles (please run on a GPU node): +(1) `1_sequence_prediction.py` (and corresponding bash script, `1_sequence_prediction.sh`): Accepts either a BED (`.bed`) or FASTA (`.fa`, `.fasta`) file as input and makes sequence predictions. + +Example usage: ``` -sh run_pipeline.sh [--cuda] +sh 1_sequence_prediction.sh --cuda ``` -Example command + +Arguments: +- ``: BED or FASTA input file +- ``: If you use a BED file as input, this must be either `hg19` or `hg38` as these are the FASTA reference genome files we provide by default. If you are using a FASTA file, you can specify whichever genome version you are using for logging purposes. +- ``: Path to output directory (will be created if does not exist) +- `--cuda`: Optional, use this flag if running on a CUDA-enabled GPU. + +You can run `python 1_sequence_prediction.py -h` for the full documentation of inputs. + +2. `1_variant_effect_prediction.py` (and corresponding bash script, `1_variant_effect_prediction.sh`): Accepts a VCF file as input and makes variant effect predictions. + +Example usage: ``` -sh run_pipeline.sh test.vcf hg19 test-output --cuda +sh 1_variant_effect_prediction.sh [--cuda] ``` Arguments: -- ``: Input VCF file -- ``: Reference FASTA. By default the framework accepts `hg19` or `hg38` coordinates. 
+- ``: VCF file +- ``: Either hg19 or hg38 - ``: Path to output directory (will be created if does not exist) - `--cuda`: Optional, use this flag if running on a CUDA-enabled GPU. -We provide `test.vcf` (hg19 coordinates) so you can try running this command once you have installed all the requirements. Additionally, `run_pipeline.gpu_node.sh` is an example SLURM script with the same expected input arguments if you need to submit your job to a compute cluster. +You can run `python 1_variant_effect_prediction.py -h` for the full documentation of inputs. + +These scripts will output the chromatin profile predictions as HDF5 files to a subdirectory `chromatin-profiles-hdf5` in your specified output directory. + +See `example_slurm_scripts/1_example_seqpred.slurm_gpu.sh` and `example_slurm_scripts/1_example_vep.slurm_gpu.sh` for sample scripts for running chromatin profile prediction on SLURM. -**Additional note**: we have added the capability of predicting variant effects from a pair of sequences in the `vep_cli_seq.py` script in the [`vep_seq`](https://github.com/FunctionLab/sei-framework/tree/vep_seq) development branch of this repo. -### Outputs +### Sequence class prediction -The following files and directories will be outputted: -- `chromatin-profiles-hdf5`: directory -- `chromatin_profiles_diffs.tsv`: chromatin profile prediction TSV file (**Note:** output file will be compressed if input has >10000 variants) -- `sequence_classes_scores.tsv`: sequence class prediction TSV file +Sequence class scores can be obtained from Sei chromatin profile predictions. There are 2 types of scores that can be computed: -The two `*.tsv` files are the final formatted outputs, while the `chromatin-profiles-hdf5` directory contains the intermediate HDF5 and row label files outputted from Selene from running the Sei deep learning model. +- Raw sequence class scores: For sequences only. Raw sequence class scores are projection scores of chromatin profile predictions projected on the unit-length vectors representing each sequence class. This is an intermediate score originally developed for variant score prediction and is made available for use for developing downstream analyses or applications, such as using them as a sequence representation. **Note** our manuscript uses the Louvain community clustering, whole-genome sequence class annotation of the human genome whenever we apply sequence classes to reference genome sequences, and we encourage the use of these annotations over the raw sequence class scores when possible. Sequence class annotations for hg38 and hg19 (lifted over from hg38) are available for download from [this Zenodo record](10.5281/zenodo.7113989). +- Sequence class variant effect score (nucleosome-occupancy-adjusted): For variants only. Computed as alt - ref of the raw sequence class scores **adjusted for nucleosome occupancy, i.e. histone normalized**. To better represent predicted variant effects on histone marks, it is necessary to normalize for nucleosome occupancy (for example, a LoF mutation near the TSS can decrease H3K4me3 modification level while increasing nucleosome occupancy, resulting in an overall increase in observed H3K4me3 quantity). Therefore, for variant effect computation, we used the sum of all histone profile predictions as an approximation to nucleosome occupancy and adjusted all histone mark predictions to remove the impact of nucleosome occupancy change (nonhistone mark predictions are unchanged). See manuscript methods for more detail. 
-You can use the HDF5 files directly if desired, but please keep in mind that the variants will not be ordered in the same way as the TSV files. (Please see the corresponding `*_row_labels.txt` file, for the variant labels.) +#### Sequence prediction +Example usage: +``` +sh 2_raw_sc_score.sh +``` + +Arguments: +- ``: Path to the Sei `_predictions.h5` file. +- ``: Path to output directory (will be created if does not exist) +You can run `python 2_raw_sc_score.py -h` for the full documentation of inputs. -### Sequence classes +#### Variant effect prediction + +Example usage: +``` +sh 2_varianteffect_sc_score.sh [--no-tsv] +``` + +Arguments: +- ``: Path to the Sei `.ref_predictions.h5` file. +- ``: Path to the Sei `.alt_predictions.h5` file. +- ``: Path to output directory (will be created if does not exist) +- `--no-tsv`: Optional flag if you'd like to suppress the outputted TSV files (see the next section 'Example variant effect prediction run' for more information). + +You can run `python 2_varianteffect_sc_score.py -h` for the full documentation of inputs. + +### Example variant effect prediction run: + +We provide `test.vcf` (hg19 coordinates) so you can try running this command once you have installed all the requirements. Additionally, `example_slurm_scripts` contains example scripts with the same expected input arguments if you need to submit your job to a compute cluster. + +Example command run on GPU: +``` +sh 1_variant_effect_prediction.sh test.vcf hg19 ./test_outputs --cuda +``` + +Example command run on CPU: +``` +sh 2_varianteffect_sc_score.sh ./test_outputs/chromatin-profiles-hdf5/test.ref_predictions.h5 \ + ./test_outputs/chromatin-profiles-hdf5/test.alt_predictions.h5 \ + ./test_outputs +``` +You can add `--no-tsv` to this command to suppress the TSV file outputs if you are comfortable working with HDF5 and NPY files. Note you will need to match the rows to the `test_row_labels.txt` file in `./test_outputs/chromatin-profiles.hdf5` and the columns to `./model/target.names` (chromatin profile HDF5 files) and `./model/seqclass.names` (sequence class NPY file). + + +Expected outputs: +- `chromatin-profiles-hdf5`: directory containing HDF5 Sei predictions files and the corresponding `test_row_labels.txt` file. +- `sorted.test.chromatin_profile_diffs.tsv`: chromatin profile prediction TSV file (**Note:** output file will be compressed if input has >10000 variants), sorted by max absolute sequence class score. +- `sorted.test.sequence_class_scores.tsv`: sequence class prediction TSV file, sorted by max absolute sequence class scores. +- `test.sequence_class_scores.npy`: sequence class scores NPY file, note this is NOT sorted and will be ordered in the same way as `chromatin-profiles-hdf5/test_row_labels.txt` file. + +## Sequence classes Sequence classes are defined based on 30 million sequences tiling the genome and thus cover a wide range of sequence activities. To help interpretation, we grouped sequence classes into groups including P (Promoter), E (Enhancer), CTCF (CTCF-cohesin binding), TF (TF binding), PC (Polycomb-repressed), HET (Heterochromatin), TN (Transcription), and L (Low Signal) sequence classes. Please refer to our manuscript for a more detailed description of the sequence classes. @@ -111,14 +178,18 @@ Sequence classes are defined based on 30 million sequences tiling the genome and The configuration file and script for running train is under the `train` directory. 
To run Sei deep learning sequence model training, you will need GPU computing capability (we run training on 4x Tesla V100 GPUs connected with NVLink). -The training data is available [here](https://doi.org/10.5281/zenodo.4907037) should be downloaded and extracted into the `train` directory. **NOTE**: because the Sei training data contains processed files from the Cistrome Project, please first agree to the Cistrome Project [terms of usage](http://cistrome.org/db/#/bdown) before downloading the data: +The training data is available [here](https://doi.org/10.5281/zenodo.4907037) and should be downloaded and extracted into the `train` directory. + +**NOTE**: because the Sei training data contains processed files from the Cistrome Project, please first agree to the Cistrome Project [terms of usage](http://cistrome.org/db/#/bdown) before downloading the data: ``` cd ./train sh ./download_data.sh # in the train directory ``` -The Sei training configuration YAML file is provided as the `train/train.yml` file. You can read more about the Selene command-line interface and configuration file formatting [here](https://selene.flatironinstitute.org/master/overview/cli.html#). You must use Selene version >0.5.0 to train this model ([release notes](https://github.com/FunctionLab/selene/blob/master/RELEASE_NOTES.md)). +The Sei training configuration YAML file is provided as the `train/train.yml` file. You can read more about the Selene command-line interface and configuration file formatting [here](https://selene.flatironinstitute.org/master/overview/cli.html#). + +You must use Selene version >0.5.0 to train this model ([release notes](https://github.com/FunctionLab/selene/blob/master/RELEASE_NOTES.md)). We also provide an example SLURM script `train.sh` for submitting a training job to a cluster.
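If you prefer to launch training programmatically rather than through `train.sh`, a minimal sketch using the same Selene utilities that the prediction CLIs in this repository import (`load_path` and `parse_configs_and_run`) could look like the following; it assumes you run it from the repository root after extracting the training data, and that all hyperparameters come from `train/train.yml`:

```
# Minimal sketch: drive Selene with the provided training configuration.
from selene_sdk.utils import load_path, parse_configs_and_run

if __name__ == "__main__":
    # Parse train/train.yml without instantiating its objects yet.
    configs = load_path("./train/train.yml", instantiate=False)
    # Build the sampler, model, and trainer defined in the YAML and run training.
    parse_configs_and_run(configs)
```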
diff --git a/example_slurm_scripts/1_example_seqpred.slurm_gpu.sh b/example_slurm_scripts/1_example_seqpred.slurm_gpu.sh new file mode 100755 index 0000000..16d4f1a --- /dev/null +++ b/example_slurm_scripts/1_example_seqpred.slurm_gpu.sh @@ -0,0 +1,18 @@ +#!/bin/bash +#SBATCH --time=1-00:00:00 +#SBATCH --gres=gpu:1 +#SBATCH --constraint=v100 +#SBATCH --partition=gpu +#SBATCH -n 1 +#SBATCH --mem 100G +#SBATCH --mail-type=FAIL +#SBATCH --mail-user= + +# Example SLURM script for running the Sei framework code +# on input sequences from a BED or FASTA file + +input_filepath="${1:-}" # path to input bed/fasta file +genome_version="${2:-}" # genome version +outdir="${3:-}" # path to output dir + +sh ./1_sequence_prediction.sh $input_filepath $genome_version $outdir --cuda diff --git a/run_pipeline.gpu_node.sh b/example_slurm_scripts/1_example_vep.slurm_gpu.sh similarity index 73% rename from run_pipeline.gpu_node.sh rename to example_slurm_scripts/1_example_vep.slurm_gpu.sh index 31dd7d5..3820efb 100755 --- a/run_pipeline.gpu_node.sh +++ b/example_slurm_scripts/1_example_vep.slurm_gpu.sh @@ -4,15 +4,15 @@ #SBATCH --constraint=v100 #SBATCH --partition=gpu #SBATCH -n 1 -#SBATCH --mem 50G +#SBATCH --mem 100G #SBATCH --mail-type=FAIL #SBATCH --mail-user= # Example SLURM script for running the Sei framework code +# on variants in the input VCF file vcf_filepath="${1:-}" # path to vcf file hg_version="${2:-}" # hg19 or hg38 outdir="${3:-}" # path to output dir -cuda="${4:-}" # --cuda flag -sh run_pipeline.sh $vcf_filepath $hg_version $outdir $cuda +sh ./1_variant_effect_prediction.sh $vcf_filepath $hg_version $outdir --cuda diff --git a/example_slurm_scripts/2_example_raw_sc.slurm_cpu.sh b/example_slurm_scripts/2_example_raw_sc.slurm_cpu.sh new file mode 100755 index 0000000..a88cb42 --- /dev/null +++ b/example_slurm_scripts/2_example_raw_sc.slurm_cpu.sh @@ -0,0 +1,11 @@ +#!/bin/bash +#SBATCH --time=1-00:00:00 +#SBATCH --partition=ccb + +# Example SLURM script for getting the raw sequence class scores +# for input sequence predictions (i.e. 
no variants) + +input_preds="${1:-}" # input preds path +outdir="${2:-}" # output dir path + +sh ./2_raw_sc_score.sh $input_preds $outdir diff --git a/example_slurm_scripts/2_example_vep_sc.slurm_cpu.sh b/example_slurm_scripts/2_example_vep_sc.slurm_cpu.sh new file mode 100755 index 0000000..35e90a8 --- /dev/null +++ b/example_slurm_scripts/2_example_vep_sc.slurm_cpu.sh @@ -0,0 +1,13 @@ +#!/bin/bash +#SBATCH --time=1-00:00:00 +#SBATCH --partition=ccb + + +# Example SLURM script for getting the variant effect sequence class scores + +ref_preds="${1:-}" # ref preds path +alt_preds="${2:-}" # alt preds path +outdir="${3:-}" # output dir path +no_tsv="${4:-}" # --no-tsv flag + +sh ./2_varianteffect_sc_score.sh $ref_preds $alt_preds $outdir $no_tsv diff --git a/images/logo.png b/images/logo.png new file mode 100644 index 0000000..4b21ccb Binary files /dev/null and b/images/logo.png differ diff --git a/model/sei_seq_prediction.yml b/model/sei_seq_prediction.yml new file mode 100755 index 0000000..1a78b7f --- /dev/null +++ b/model/sei_seq_prediction.yml @@ -0,0 +1,23 @@ +--- +ops: [analyze] +model: { + path: /model/sei.py, + class: Sei, + class_args: { + }, + non_strand_specific: mean +} +analyze_sequences: !obj:selene_sdk.predict.AnalyzeSequences { + sequence_length: 4096, + batch_size: 128, + trained_model_path: /model/sei.pth, + features: !obj:selene_sdk.utils.load_features_list { + input_path: /model/target.names + }, + write_mem_limit: 1000 +} +prediction: { + output_format: hdf5 +} +random_seed: 999 +... diff --git a/model/sei_prediction.yml b/model/sei_varianteffect_prediction.yml similarity index 88% rename from model/sei_prediction.yml rename to model/sei_varianteffect_prediction.yml index 0da8e2f..ae0b0bb 100755 --- a/model/sei_prediction.yml +++ b/model/sei_varianteffect_prediction.yml @@ -14,10 +14,10 @@ analyze_sequences: !obj:selene_sdk.predict.AnalyzeSequences { features: !obj:selene_sdk.utils.load_features_list { input_path: /model/target.names }, - write_mem_limit: 100000 + write_mem_limit: 1000 } variant_effect_prediction: { - save_data: [predictions], + save_data: [diffs, predictions], output_format: hdf5 } random_seed: 999 diff --git a/sequence_class.py b/sequence_class.py deleted file mode 100755 index af65891..0000000 --- a/sequence_class.py +++ /dev/null @@ -1,173 +0,0 @@ -""" -Description: - This script is run after `vep_cli.py`. It computes the sequence-class - level variant effect scores based on Sei chromatin profile predictions, - sorts the variants based on the maximum absolute scores across sequence classes - and outputs the results as TSV files. - -Usage: - sequence_class.py - sequence_class.py -h | --help - -Options: - The results directory. - Name of VCF file. - -""" -import os - -from docopt import docopt -import h5py -import numpy as np -import pandas as pd - - -def get_targets(filename): - targets = [] - with open(filename, 'r') as file_handle: - for line in file_handle: - targets.append(line.strip()) - return targets - - -def get_data(filename): - fh = h5py.File(filename, 'r') - data = fh["data"][()] - fh.close() - return data - - -def read_rowlabels_file(rowlabels, use_strand=False): - labels = [] - with open(rowlabels, 'r') as file_handle: - for line in file_handle: - if 'contains_unk' in line: - continue - cols = line.strip().split('\t') - chrom, pos, id, ref, alt, strand, ref_match, contains_unk = cols - if not use_strand: - strand = '.' 
- labels.append( - (chrom, pos, id, ref, alt, strand, - ref_match, contains_unk)) - return labels - - -def write_to_tsv(max_abs_diff, - chromatin_profile_diffs, - sequence_class_projscores, - chromatin_profiles, - seqclass_names, - rowlabels, - output_chromatin_profile_file, - output_sequence_class_file): - sorted_sc_abs_diff = np.sort(max_abs_diff)[::-1] - sorted_ixs = np.argsort(max_abs_diff)[::-1] - - assert len(sorted_ixs) == chromatin_profile_diffs.shape[0] - rowlabels = np.array(rowlabels)[sorted_ixs] - chromatin_profile_diffs = chromatin_profile_diffs[sorted_ixs, :] - sequence_class_projscores = sequence_class_projscores[sorted_ixs,:] - chromatin_profile_rows = [] - sequence_class_rows = [] - for ix, r in enumerate(rowlabels): - ref_match = r[-2] - contains_unk = r[-1] - label = r[:-2] - chromatin_profile_row = np.hstack( - [[sorted_sc_abs_diff[ix], ref_match, contains_unk], - label, - chromatin_profile_diffs[ix, :]]).tolist() - sequence_class_row = np.hstack( - [[sorted_sc_abs_diff[ix], ref_match, contains_unk], - label, - sequence_class_projscores[ix, :]]).tolist() - chromatin_profile_rows.append(chromatin_profile_row) - sequence_class_rows.append(sequence_class_row) - - del chromatin_profile_diffs - colnames = [ - 'seqclass_max_absdiff', 'ref_match', 'contains_unk', - 'chrom', 'pos', 'id', 'ref', 'alt', 'strand'] - - sc_df = pd.DataFrame( - sequence_class_rows, columns=colnames + seqclass_names) - check_columns = sc_df.columns.tolist() - check_columns.remove('strand') - sc_df.drop_duplicates(subset=check_columns, inplace=True) - sc_df.to_csv(output_sequence_class_file, sep='\t', index=False) - - cp_df = pd.DataFrame( - chromatin_profile_rows, columns=colnames + chromatin_profiles) - check_columns = cp_df.columns.tolist() - check_columns.remove('strand') - cp_df.drop_duplicates(subset=check_columns, inplace=True) - if len(cp_df) > 10000: - cp_df.to_csv( - output_chromatin_profile_file, sep='\t', index=False, compression='gzip') - else: - cp_df.to_csv( - output_chromatin_profile_file, sep='\t', index=False) - - - -if __name__ == "__main__": - arguments = docopt( - __doc__, - version='1.0.0') - results_dir = arguments[''] - input_vcf = arguments[''] - - _, filename = os.path.split(input_vcf) - filename_prefix = '.'.join(filename.split('.')[:-1]) - - sei_dir = "./model" - chromatin_profiles = get_targets(os.path.join(sei_dir, "target.names")) - seqclass_names = get_targets(os.path.join(sei_dir, "seqclass.names")) - - profile_pred_dir = os.path.join(results_dir, 'chromatin-profiles-hdf5') - rowlabels_filename = "{0}_row_labels.txt".format(filename_prefix) - chromatin_profile_rowlabels = os.path.join(profile_pred_dir, rowlabels_filename) - - chromatin_profile_ref = get_data(os.path.join( - profile_pred_dir, - "{0}.ref_predictions.h5".format(filename_prefix))) - chromatin_profile_alt = get_data(os.path.join( - profile_pred_dir, - "{0}.alt_predictions.h5".format(filename_prefix))) - - clustervfeat = np.load(os.path.join(sei_dir, 'projvec_targets.npy')) - histone_inds = np.load(os.path.join(sei_dir, 'histone_inds.npy')) - - chromatin_profile_ref_adjust = chromatin_profile_ref.copy() - chromatin_profile_ref_adjust[:, histone_inds] = \ - chromatin_profile_ref_adjust[:, histone_inds] * ( - (np.sum(chromatin_profile_ref[:, histone_inds], axis=1)*0.5 + - np.sum(chromatin_profile_alt[:, histone_inds], axis=1)*0.5) / - np.sum(chromatin_profile_ref[:, histone_inds], axis=1))[:, None] - - chromatin_profile_alt_adjust = chromatin_profile_alt.copy() - chromatin_profile_alt_adjust[:, histone_inds] = 
\ - chromatin_profile_alt_adjust[:, histone_inds] * ( - (np.sum(chromatin_profile_ref[:, histone_inds], axis=1)*0.5 + - np.sum(chromatin_profile_alt[:, histone_inds], axis=1)*0.5) / - np.sum(chromatin_profile_alt[:, histone_inds], axis=1))[:, None] - - refproj = (np.dot(chromatin_profile_ref_adjust,clustervfeat.T) / - np.linalg.norm(clustervfeat,axis=1)) - altproj = (np.dot(chromatin_profile_alt_adjust,clustervfeat.T) / - np.linalg.norm(clustervfeat,axis=1)) - diffproj = altproj[:,:40] - refproj[:,:40] - - max_abs_diff = np.abs(diffproj).max(axis=1) - - write_to_tsv(max_abs_diff, # max sequence class score - chromatin_profile_alt - chromatin_profile_ref, # chromatin profile diffs - diffproj, # sequence class diffs - chromatin_profiles, # chromatin profile targets - seqclass_names, # sequence class names - read_rowlabels_file(chromatin_profile_rowlabels, use_strand=False), - os.path.join(results_dir, "chromatin_profile_diffs.tsv"), - os.path.join(results_dir, "sequence_class_scores.tsv")) - - diff --git a/utils.py b/utils.py new file mode 100755 index 0000000..11fa3fd --- /dev/null +++ b/utils.py @@ -0,0 +1,110 @@ +import os + +import h5py +import numpy as np +import pandas as pd + + +def get_targets(filename): + """ + Load the names of all the chromatin profiles predicted by Sei + """ + targets = [] + with open(filename, 'r') as file_handle: + for line in file_handle: + targets.append(line.strip()) + return targets + + +def get_data(filename): + """ + Load HDF5 file of predictions into memory + """ + fh = h5py.File(filename, 'r') + data = fh["data"][()] + fh.close() + return data + + +def sc_projection(chromatin_profile_preds, clustervfeat): + return (np.dot(chromatin_profile_preds, clustervfeat.T) / + np.linalg.norm(clustervfeat, axis=1)) + + +def sc_hnorm_varianteffect(chromatin_profile_ref, chromatin_profile_alt, clustervfeat, histone_inds): + chromatin_profile_ref_adjust = chromatin_profile_ref.copy() + chromatin_profile_ref_adjust[:, histone_inds] = \ + chromatin_profile_ref_adjust[:, histone_inds] * ( + (np.sum(chromatin_profile_ref[:, histone_inds], axis=1)*0.5 + + np.sum(chromatin_profile_alt[:, histone_inds], axis=1)*0.5) / + np.sum(chromatin_profile_ref[:, histone_inds], axis=1))[:, None] + + chromatin_profile_alt_adjust = chromatin_profile_alt.copy() + chromatin_profile_alt_adjust[:, histone_inds] = \ + chromatin_profile_alt_adjust[:, histone_inds] * ( + (np.sum(chromatin_profile_ref[:, histone_inds], axis=1)*0.5 + + np.sum(chromatin_profile_alt[:, histone_inds], axis=1)*0.5) / + np.sum(chromatin_profile_alt[:, histone_inds], axis=1))[:, None] + + refproj = sc_projection(chromatin_profile_ref_adjust, clustervfeat) + altproj = sc_projection(chromatin_profile_alt_adjust, clustervfeat) + diffproj = altproj[:,:40] - refproj[:,:40] + return diffproj + + +def get_filename_prefix(filename): + """Filename must follow Selene output file conventions. 
+ """ + prefix = None + if '.alt_predictions' in filename: + prefix = filename.split('.alt_predictions')[0] + elif '.ref_predictions' in filename: + prefix = filename.split('.ref_predictions')[0] + else: + prefix = filename.split('_predictions')[0] + return prefix + + +def write_to_tsv(max_abs_diff, + chromatin_profile_diffs, + sequence_class_projscores, + chromatin_profiles, + seqclass_names, + rowlabels, + output_chromatin_profile_file, + output_sequence_class_file): + sorted_sc_abs_diff = np.sort(max_abs_diff)[::-1] + sorted_maxsc_df = pd.DataFrame(sorted_sc_abs_diff, # dataframe + columns=['seqclass_max_absdiff']) + + sorted_ixs = np.argsort(max_abs_diff)[::-1] + assert len(sorted_ixs) == chromatin_profile_diffs.shape[0] + sorted_rowlabels = rowlabels.iloc[sorted_ixs] # dataframe + sorted_rowlabels.reset_index(inplace=True) + rowlabel_columns = sorted_rowlabels.columns.tolist() + + # sorted now + chromatin_profile_diffs = chromatin_profile_diffs[sorted_ixs, :] + sequence_class_projscores = sequence_class_projscores[sorted_ixs,:] + + # dataframes + sorted_profiles_df = pd.DataFrame(chromatin_profile_diffs, columns=chromatin_profiles) + sorted_sc_df = pd.DataFrame(sequence_class_projscores, columns=seqclass_names) + del chromatin_profile_diffs + + sei_df = pd.concat([sorted_maxsc_df, sorted_rowlabels, sorted_profiles_df], + axis=1) + sc_df = pd.concat([sorted_maxsc_df, sorted_rowlabels, sorted_sc_df], + axis=1) + sc_df[['seqclass_max_absdiff'] + rowlabel_columns + seqclass_names].to_csv( + output_sequence_class_file, sep='\t', index=False) + + if len(sei_df) > 10000: + sei_df[['seqclass_max_absdiff'] + rowlabel_columns + chromatin_profiles].to_csv( + output_chromatin_profile_file, sep='\t', index=False, compression='gzip') + else: + sei_df[['seqclass_max_absdiff'] + rowlabel_columns + chromatin_profiles].to_csv( + output_chromatin_profile_file, sep='\t', index=False) + + +