From 7082e70e4949d4d4ec8968baeecf4ead52dc532a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryam7@users.noreply.github.com> Date: Thu, 29 Apr 2021 11:21:23 -0700 Subject: [PATCH 1/5] temporarily handle 2vcf exit code (see #28) --- rules/classify.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules/classify.smk b/rules/classify.smk index 49f542d..748d1f6 100644 --- a/rules/classify.smk +++ b/rules/classify.smk @@ -242,7 +242,7 @@ rule tsv2vcf: output: temp(config['out']+"/{sample}/results.vcf.gz") conda: "../envs/prc.yml" shell: - "zcat {input.merge} | scripts/cgrep.bash - -E '^(CHROM|POS|REF)$|.*~(REF|ALT)$' | scripts/2vcf.py -o {output} {params.callers} {input.results}" + "zcat {input.merge} | scripts/cgrep.bash - -E '^(CHROM|POS|REF)$|.*~(REF|ALT)$' | scripts/2vcf.py -o {output} {params.callers} {input.results} || true" rule fix_vcf_header: """ add contigs to the header of the vcf """ From 86e1453f74d46cfba8cff7bf0f4fdbbf59311604 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryam7@users.noreply.github.com> Date: Thu, 29 Apr 2021 11:22:17 -0700 Subject: [PATCH 2/5] handle pvals of 1 in random forest output when creating VCF --- scripts/2vcf.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/2vcf.py b/scripts/2vcf.py index d7f19ad..32559ab 100755 --- a/scripts/2vcf.py +++ b/scripts/2vcf.py @@ -14,8 +14,12 @@ def phred(vals): """ apply the phred scale to the vals provided """ - return -10*np.log10(1-vals) - return -10*np.ma.log10(1-vals).filled(-3) # fill all infinite values with a phred scale of 30 + with np.errstate(divide='raise'): + try: + return -10*np.log10(1-vals) + except FloatingPointError: + return np.float64(30) + return -10*np.ma.log10(1-vals).filled(-3) # fill all infinite values with a phred scale of 30 def plot_line(lst, show_discards=False): plt.clf() From 753f81aee816a3396a657a8fd15b7069c0bd95e8 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryam7@users.noreply.github.com> Date: Thu, 29 Apr 2021 11:28:34 -0700 Subject: [PATCH 3/5] add a script for analyzing allele conflicts --- scripts/README.md | 3 +++ scripts/allele_conflicts.bash | 49 +++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100755 scripts/allele_conflicts.bash diff --git a/scripts/README.md b/scripts/README.md index 5547750..aaf5afd 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -7,6 +7,9 @@ All python scripts implement the `--help` argument. For bash, R, and awk scripts ### [2vcf.py](2vcf.py) A python script that uses files from the `prepare` and `classify` pipelines to create a VCF with the final, predicted variants. This script also has a special internal mode, which can be used for recalibrating the QUAL scores output in the VCF. +### [allele_conflicts.bash](allele_conflicts.bash) +A bash script for identifying sites at which the variant callers in our ensemble outputted conflicting alleles. + ### [cgrep.bash](cgrep.bash) A bash script for extracting columns from TSVs via `grep`. Every argument besides the first is passed directly to `grep`. diff --git a/scripts/allele_conflicts.bash b/scripts/allele_conflicts.bash new file mode 100755 index 0000000..a497fe9 --- /dev/null +++ b/scripts/allele_conflicts.bash @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +# List all positions (to stdout) at which VarCA called a variant but there were conflicts for the alternative allele. +# Note that this script will only work for the chosen subset of variant callers. + +# arg1: merge.tsv.gz file +# arg2: results.tsv.gz file +# arg3: 'snp' or 'indel' + +# Ex: scripts/allele_conflicts.bash out/merged_indel/SRR891269/merge.tsv.gz out/new-classify/classify-indel/SRR891269_even_test/results.tsv.gz indel + +if [ "$3" = 'indel' ]; then + zcat "$1" | \ + scripts/cgrep.bash - -E '^(CHROM|POS)$|(gatk|varscan|vardict|pindel|illumina-strelka|pg-indel).*~(ALT)$' | \ + awk -F $'\t' -v 'OFS=\t' '$3 != $4 || $4 != $5 || $5 != $6 || $6 != $7 || $7 != $3' | \ + tail -n+2 | \ + awk -F $'\t' -v 'OFS=\t' '{ { for(i=3; i<=NF-1; i++) count[$i]++; } PROCINFO["sorted_in"] = "@val_num_desc"; { for (val in count) { print $0, val, count[val]; break; } } delete count; }' | \ + sed 's/\t/,/' | \ + LC_ALL=C sort -t $'\t' -k1,1 | \ + LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order - <( + zcat "$2" | \ + awk -F $"\t" -v 'OFS=\t' 'NR == 1 || $NF == 1' | \ + sed 's/\t/,/' | \ + LC_ALL=C sort -t $'\t' -k1,1 + ) +else + zcat "$1" | \ + scripts/cgrep.bash - -E '^(CHROM|POS)$|(gatk|varscan|vardict|pg-snp).*~(ALT)$' | \ + awk -F $'\t' -v 'OFS=\t' '$3 != $4 || $4 != $5 || $5 != $3' | \ + tail -n+2 | \ + awk -F $'\t' -v 'OFS=\t' '{ { for(i=3; i<=NF-1; i++) count[$i]++; } PROCINFO["sorted_in"] = "@val_num_desc"; { for (val in count) { print $0, val, count[val]; break; } } delete count; }' | \ + sed 's/\t/,/' | \ + LC_ALL=C sort -t $'\t' -k1,1 | \ + LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order - <( + zcat "$2" | \ + awk -F $"\t" -v 'OFS=\t' 'NR == 1 || $NF == 1' | \ + sed 's/\t/,/' | \ + LC_ALL=C sort -t $'\t' -k1,1 + ) +fi | \ +sed 's/,/\t/' | \ +LC_ALL=C sort -t $'\t' -k1,1V -k2,2n + +# To make a table containing alternative alleles for the following rules (as columns) +# 1) variant caller priority +# 2) majority rule +# 3) platinum genomes +# bcftools query -f '%CHROM\t%POS\t%INFO/CALLER\n' out/new-classify/classify-indel/SRR891269_even_test/final.vcf.gz | sed 's/gatk-indel/1/; s/varscan-indel/2/; s/vardict-indel/3/; s/pindel/4/; s/illumina-strelka/5/' | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1 | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order <(zcat out/new-classify/classify-indel/SRR891269_even_test/allele_conflicts.tsv.gz | sed 's/\t/,/' | sort -t $'\t' -k1,1) - | cut -f2- | awk -F $'\t' -v 'OFS=\t' '{print $$NF, $7, $6;}' | less +# bcftools query -f '%CHROM\t%POS\t%INFO/CALLER\n' out/new-classify/classify-snp/SRR891269_even_test/final.vcf.gz | sed 's/gatk-snp/1/; s/varscan-snp/2/; s/vardict-snp/3/' | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1 | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order <(zcat out/new-classify/classify-snp/SRR891269_even_test/allele_conflicts.tsv.gz | sed 's/\t/,/' | sort -t $'\t' -k1,1) - | cut -f2- | awk -F $'\t' -v 'OFS=\t' '{print $$NF, $5, $4;}' | less From af69606981c99424d01dc6a5afa3ccd0e19173c1 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryam7@users.noreply.github.com> Date: Fri, 2 Jul 2021 13:10:00 -0700 Subject: [PATCH 4/5] subset columns before import in 2vcf.py; also resolve #28 --- rules/classify.smk | 2 +- rules/prepare.smk | 2 +- scripts/2vcf.py | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/rules/classify.smk b/rules/classify.smk index 748d1f6..41eee60 100644 --- a/rules/classify.smk +++ b/rules/classify.smk @@ -242,7 +242,7 @@ rule tsv2vcf: output: temp(config['out']+"/{sample}/results.vcf.gz") conda: "../envs/prc.yml" shell: - "zcat {input.merge} | scripts/cgrep.bash - -E '^(CHROM|POS|REF)$|.*~(REF|ALT)$' | scripts/2vcf.py -o {output} {params.callers} {input.results} || true" + "scripts/2vcf.py -o {output} {params.callers} {input.results} {input.merge}" rule fix_vcf_header: """ add contigs to the header of the vcf """ diff --git a/rules/prepare.smk b/rules/prepare.smk index ce5f93d..f5f0c1b 100644 --- a/rules/prepare.smk +++ b/rules/prepare.smk @@ -144,7 +144,7 @@ rule bed_peaks: # to convert to BED, we must extract the first three columns (chr, start, stop) "cut -f -3 \"{input.peaks}\" | " "bedtools getfasta -fi {input.ref} -bedOut -bed stdin | " - "sort -t $'\t' -k1,1V -k2,2n > \"{output}\" && " + "sort -t $'\\t' -k1,1V -k2,2n > \"{output}\" && " "test -s \"{output}\"" diff --git a/scripts/2vcf.py b/scripts/2vcf.py index 32559ab..64afddb 100755 --- a/scripts/2vcf.py +++ b/scripts/2vcf.py @@ -218,7 +218,8 @@ def main(prepared, classified, callers=None, cs=1000, all_sites=False, pretty=Fa prepared = get_calls( pd.read_csv( prepared, sep="\t", header=0, index_col=["CHROM", "POS"], - dtype=str, chunksize=cs, na_values="." + dtype=str, chunksize=cs, na_values=".", + usecols=lambda col: col in ['CHROM', 'POS', 'REF'] or col.endswith('~REF') or col.endswith('~ALT') ), callers, pretty ) # flush out the first item in the generator: the vartype @@ -338,6 +339,8 @@ def write_vcf(out, records): callers = args.callers.split(",") if not args.internal: + import matplotlib + matplotlib.use('Agg') vcf = write_vcf(args.out, main(args.prepared, args.classified, callers, args.chunksize, args.all, args.pretty, args.type)) else: if not sys.flags.interactive: From abfc6b77716cf12f20bda000e124ce5558f0e07b Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryam7@users.noreply.github.com> Date: Fri, 2 Jul 2021 13:22:36 -0700 Subject: [PATCH 5/5] add header to allele conflicts script --- scripts/allele_conflicts.bash | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/allele_conflicts.bash b/scripts/allele_conflicts.bash index a497fe9..a794998 100755 --- a/scripts/allele_conflicts.bash +++ b/scripts/allele_conflicts.bash @@ -10,6 +10,7 @@ # Ex: scripts/allele_conflicts.bash out/merged_indel/SRR891269/merge.tsv.gz out/new-classify/classify-indel/SRR891269_even_test/results.tsv.gz indel if [ "$3" = 'indel' ]; then +# echo -e "CHROM\tPOS\tgatk\tvarscan\tvardict\tpindel\tstrelka\tpg\tmajority\tmajority_idx\tgatk\tvarscan\tvardict\tpindel\tstrelka\tpg\tpg\tprob0\tprob1\tvarca" zcat "$1" | \ scripts/cgrep.bash - -E '^(CHROM|POS)$|(gatk|varscan|vardict|pindel|illumina-strelka|pg-indel).*~(ALT)$' | \ awk -F $'\t' -v 'OFS=\t' '$3 != $4 || $4 != $5 || $5 != $6 || $6 != $7 || $7 != $3' | \ @@ -24,6 +25,7 @@ if [ "$3" = 'indel' ]; then LC_ALL=C sort -t $'\t' -k1,1 ) else +# echo -e "CHROM\tPOS\tgatk\tvarscan\tvardict\tpg\tmajority\tmajority_idx\tgatk\tvarscan\tvardict\tpg\tpg\tprob0\tprob1\tvarca" zcat "$1" | \ scripts/cgrep.bash - -E '^(CHROM|POS)$|(gatk|varscan|vardict|pg-snp).*~(ALT)$' | \ awk -F $'\t' -v 'OFS=\t' '$3 != $4 || $4 != $5 || $5 != $3' | \ @@ -42,8 +44,8 @@ sed 's/,/\t/' | \ LC_ALL=C sort -t $'\t' -k1,1V -k2,2n # To make a table containing alternative alleles for the following rules (as columns) -# 1) variant caller priority +# 1) caller priority rule # 2) majority rule # 3) platinum genomes -# bcftools query -f '%CHROM\t%POS\t%INFO/CALLER\n' out/new-classify/classify-indel/SRR891269_even_test/final.vcf.gz | sed 's/gatk-indel/1/; s/varscan-indel/2/; s/vardict-indel/3/; s/pindel/4/; s/illumina-strelka/5/' | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1 | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order <(zcat out/new-classify/classify-indel/SRR891269_even_test/allele_conflicts.tsv.gz | sed 's/\t/,/' | sort -t $'\t' -k1,1) - | cut -f2- | awk -F $'\t' -v 'OFS=\t' '{print $$NF, $7, $6;}' | less -# bcftools query -f '%CHROM\t%POS\t%INFO/CALLER\n' out/new-classify/classify-snp/SRR891269_even_test/final.vcf.gz | sed 's/gatk-snp/1/; s/varscan-snp/2/; s/vardict-snp/3/' | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1 | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order <(zcat out/new-classify/classify-snp/SRR891269_even_test/allele_conflicts.tsv.gz | sed 's/\t/,/' | sort -t $'\t' -k1,1) - | cut -f2- | awk -F $'\t' -v 'OFS=\t' '{print $$NF, $5, $4;}' | less +# bcftools query -f '%CHROM\t%POS\t%INFO/CALLER\n' out-original/new-classify/classify-indel/SRR891269_even_test/final.vcf.gz | sed 's/gatk-indel/1/; s/varscan-indel/2/; s/vardict-indel/3/; s/pindel/4/; s/illumina-strelka/5/' | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1 | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order <(zcat out-original/new-classify/classify-indel/SRR891269_even_test/allele_conflicts.tsv.gz | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1) - | cut -f2- | awk -F $'\t' -v 'OFS=\t' '{print $$NF, $7, $6;}' | less +# bcftools query -f '%CHROM\t%POS\t%INFO/CALLER\n' out-original/new-classify/classify-snp/SRR891269_even_test/final.vcf.gz | sed 's/gatk-snp/1/; s/varscan-snp/2/; s/vardict-snp/3/' | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1 | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order <(zcat out-original/new-classify/classify-snp/SRR891269_even_test/allele_conflicts.tsv.gz | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1) - | cut -f2- | awk -F $'\t' -v 'OFS=\t' '{print $$NF, $5, $4;}' | less