mcvickerlab · aryarm · Jul 3, 2021 · Apr 29, 2021 · Apr 29, 2021 · Apr 29, 2021
diff --git a/rules/classify.smk b/rules/classify.smk
@@ -242,7 +242,7 @@ rule tsv2vcf:
     output: temp(config['out']+"/{sample}/results.vcf.gz")
     conda: "../envs/prc.yml"
     shell:
-        "zcat {input.merge} | scripts/cgrep.bash - -E '^(CHROM|POS|REF)$|.*~(REF|ALT)$' | scripts/2vcf.py -o {output} {params.callers} {input.results}"
+        "scripts/2vcf.py -o {output} {params.callers} {input.results} {input.merge}"
 
 rule fix_vcf_header:
     """ add contigs to the header of the vcf """

diff --git a/rules/prepare.smk b/rules/prepare.smk
@@ -144,7 +144,7 @@ rule bed_peaks:
         # to convert to BED, we must extract the first three columns (chr, start, stop)
         "cut -f -3 \"{input.peaks}\" | "
         "bedtools getfasta -fi {input.ref} -bedOut -bed stdin | "
-        "sort -t $'\t' -k1,1V -k2,2n > \"{output}\" && "
+        "sort -t $'\\t' -k1,1V -k2,2n > \"{output}\" && "
         "test -s \"{output}\""
 
 

diff --git a/scripts/2vcf.py b/scripts/2vcf.py
@@ -14,8 +14,12 @@
 
 def phred(vals):
     """ apply the phred scale to the vals provided """
-    return -10*np.log10(1-vals)
-    return -10*np.ma.log10(1-vals).filled(-3) # fill all infinite values with a phred scale of 30
+    with np.errstate(divide='raise'):
+        try:
+            return -10*np.log10(1-vals)
+        except FloatingPointError:
+            return np.float64(30)
+            return -10*np.ma.log10(1-vals).filled(-3) # fill all infinite values with a phred scale of 30
 
 def plot_line(lst, show_discards=False):
     plt.clf()
@@ -214,7 +218,8 @@ def main(prepared, classified, callers=None, cs=1000, all_sites=False, pretty=Fa
     prepared = get_calls(
         pd.read_csv(
             prepared, sep="\t", header=0, index_col=["CHROM", "POS"],
-            dtype=str, chunksize=cs, na_values="."
+            dtype=str, chunksize=cs, na_values=".",
+            usecols=lambda col: col in ['CHROM', 'POS', 'REF'] or col.endswith('~REF') or col.endswith('~ALT')
         ), callers, pretty
     )
     # flush out the first item in the generator: the vartype
@@ -334,6 +339,8 @@ def write_vcf(out, records):
         callers = args.callers.split(",")
 
     if not args.internal:
+        import matplotlib
+        matplotlib.use('Agg')
         vcf = write_vcf(args.out, main(args.prepared, args.classified, callers, args.chunksize, args.all, args.pretty, args.type))
     else:
         if not sys.flags.interactive:

diff --git a/scripts/README.md b/scripts/README.md
@@ -7,6 +7,9 @@ All python scripts implement the `--help` argument. For bash, R, and awk scripts
 ### [2vcf.py](2vcf.py)
 A python script that uses files from the `prepare` and `classify` pipelines to create a VCF with the final, predicted variants. This script also has a special internal mode, which can be used for recalibrating the QUAL scores output in the VCF.
 
+### [allele_conflicts.bash](allele_conflicts.bash)
+A bash script for identifying sites at which the variant callers in our ensemble outputted conflicting alleles.
+
 ### [cgrep.bash](cgrep.bash)
 A bash script for extracting columns from TSVs via `grep`. Every argument besides the first is passed directly to `grep`.
 

diff --git a/scripts/allele_conflicts.bash b/scripts/allele_conflicts.bash
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+
+# List all positions (to stdout) at which VarCA called a variant but there were conflicts for the alternative allele.
+# Note that this script will only work for the chosen subset of variant callers.
+
+# arg1: merge.tsv.gz file
+# arg2: results.tsv.gz file
+# arg3: 'snp' or 'indel'
+
+# Ex: scripts/allele_conflicts.bash out/merged_indel/SRR891269/merge.tsv.gz out/new-classify/classify-indel/SRR891269_even_test/results.tsv.gz indel
+
+if [ "$3" = 'indel' ]; then
+#    echo -e "CHROM\tPOS\tgatk\tvarscan\tvardict\tpindel\tstrelka\tpg\tmajority\tmajority_idx\tgatk\tvarscan\tvardict\tpindel\tstrelka\tpg\tpg\tprob0\tprob1\tvarca"
+    zcat "$1" | \
+    scripts/cgrep.bash - -E '^(CHROM|POS)$|(gatk|varscan|vardict|pindel|illumina-strelka|pg-indel).*~(ALT)$' | \
+    awk -F $'\t' -v 'OFS=\t' '$3 != $4 || $4 != $5 || $5 != $6 || $6 != $7 || $7 != $3' | \
+    tail -n+2 | \
+    awk -F $'\t' -v 'OFS=\t' '{ { for(i=3; i<=NF-1; i++) count[$i]++; } PROCINFO["sorted_in"] = "@val_num_desc"; { for (val in count) { print $0, val, count[val]; break; } } delete count; }' | \
+    sed 's/\t/,/' | \
+    LC_ALL=C sort -t $'\t' -k1,1 | \
+    LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order - <(
+        zcat "$2" | \
+        awk -F $"\t" -v 'OFS=\t' 'NR == 1 || $NF == 1' | \
+        sed 's/\t/,/' | \
+        LC_ALL=C sort -t $'\t' -k1,1
+    )
+else
+#    echo -e "CHROM\tPOS\tgatk\tvarscan\tvardict\tpg\tmajority\tmajority_idx\tgatk\tvarscan\tvardict\tpg\tpg\tprob0\tprob1\tvarca"
+    zcat "$1" | \
+    scripts/cgrep.bash - -E '^(CHROM|POS)$|(gatk|varscan|vardict|pg-snp).*~(ALT)$' | \
+    awk -F $'\t' -v 'OFS=\t' '$3 != $4 || $4 != $5 || $5 != $3' | \
+    tail -n+2 | \
+    awk -F $'\t' -v 'OFS=\t' '{ { for(i=3; i<=NF-1; i++) count[$i]++; } PROCINFO["sorted_in"] = "@val_num_desc"; { for (val in count) { print $0, val, count[val]; break; } } delete count; }' | \
+    sed 's/\t/,/' | \
+    LC_ALL=C sort -t $'\t' -k1,1 | \
+    LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order - <(
+        zcat "$2" | \
+        awk -F $"\t" -v 'OFS=\t' 'NR == 1 || $NF == 1' | \
+        sed 's/\t/,/' | \
+        LC_ALL=C sort -t $'\t' -k1,1
+    )
+fi | \
+sed 's/,/\t/' | \
+LC_ALL=C sort -t $'\t' -k1,1V -k2,2n
+
+# To make a table containing alternative alleles for the following rules (as columns)
+# 1) caller priority rule
+# 2) majority rule
+# 3) platinum genomes
+# bcftools query -f '%CHROM\t%POS\t%INFO/CALLER\n' out-original/new-classify/classify-indel/SRR891269_even_test/final.vcf.gz | sed 's/gatk-indel/1/; s/varscan-indel/2/; s/vardict-indel/3/; s/pindel/4/; s/illumina-strelka/5/' | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1 | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order <(zcat out-original/new-classify/classify-indel/SRR891269_even_test/allele_conflicts.tsv.gz | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1) - | cut -f2- | awk -F $'\t' -v 'OFS=\t' '{print $$NF, $7, $6;}' | less
+# bcftools query -f '%CHROM\t%POS\t%INFO/CALLER\n' out-original/new-classify/classify-snp/SRR891269_even_test/final.vcf.gz | sed 's/gatk-snp/1/; s/varscan-snp/2/; s/vardict-snp/3/' | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1 | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order <(zcat out-original/new-classify/classify-snp/SRR891269_even_test/allele_conflicts.tsv.gz | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1) - | cut -f2- | awk -F $'\t' -v 'OFS=\t' '{print $$NF, $5, $4;}' | less