Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion rules/classify.smk
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ rule tsv2vcf:
output: temp(config['out']+"/{sample}/results.vcf.gz")
conda: "../envs/prc.yml"
shell:
"zcat {input.merge} | scripts/cgrep.bash - -E '^(CHROM|POS|REF)$|.*~(REF|ALT)$' | scripts/2vcf.py -o {output} {params.callers} {input.results}"
"scripts/2vcf.py -o {output} {params.callers} {input.results} {input.merge}"

rule fix_vcf_header:
""" add contigs to the header of the vcf """
Expand Down
2 changes: 1 addition & 1 deletion rules/prepare.smk
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ rule bed_peaks:
# to convert to BED, we must extract the first three columns (chr, start, stop)
"cut -f -3 \"{input.peaks}\" | "
"bedtools getfasta -fi {input.ref} -bedOut -bed stdin | "
"sort -t $'\t' -k1,1V -k2,2n > \"{output}\" && "
"sort -t $'\\t' -k1,1V -k2,2n > \"{output}\" && "
"test -s \"{output}\""


Expand Down
13 changes: 10 additions & 3 deletions scripts/2vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,12 @@

def phred(vals):
""" apply the phred scale to the vals provided """
return -10*np.log10(1-vals)
return -10*np.ma.log10(1-vals).filled(-3) # fill all infinite values with a phred scale of 30
with np.errstate(divide='raise'):
try:
return -10*np.log10(1-vals)
except FloatingPointError:
return np.float64(30)
return -10*np.ma.log10(1-vals).filled(-3) # fill all infinite values with a phred scale of 30

def plot_line(lst, show_discards=False):
plt.clf()
Expand Down Expand Up @@ -214,7 +218,8 @@ def main(prepared, classified, callers=None, cs=1000, all_sites=False, pretty=Fa
prepared = get_calls(
pd.read_csv(
prepared, sep="\t", header=0, index_col=["CHROM", "POS"],
dtype=str, chunksize=cs, na_values="."
dtype=str, chunksize=cs, na_values=".",
usecols=lambda col: col in ['CHROM', 'POS', 'REF'] or col.endswith('~REF') or col.endswith('~ALT')
), callers, pretty
)
# flush out the first item in the generator: the vartype
Expand Down Expand Up @@ -334,6 +339,8 @@ def write_vcf(out, records):
callers = args.callers.split(",")

if not args.internal:
import matplotlib
matplotlib.use('Agg')
vcf = write_vcf(args.out, main(args.prepared, args.classified, callers, args.chunksize, args.all, args.pretty, args.type))
else:
if not sys.flags.interactive:
Expand Down
3 changes: 3 additions & 0 deletions scripts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ All python scripts implement the `--help` argument. For bash, R, and awk scripts
### [2vcf.py](2vcf.py)
A python script that uses files from the `prepare` and `classify` pipelines to create a VCF with the final, predicted variants. This script also has a special internal mode, which can be used for recalibrating the QUAL scores output in the VCF.

### [allele_conflicts.bash](allele_conflicts.bash)
A bash script for identifying sites at which the variant callers in our ensemble outputted conflicting alleles.

### [cgrep.bash](cgrep.bash)
A bash script for extracting columns from TSVs via `grep`. Every argument besides the first is passed directly to `grep`.

Expand Down
51 changes: 51 additions & 0 deletions scripts/allele_conflicts.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env bash

# List all positions (to stdout) at which VarCA called a variant but there were conflicts for the alternative allele.
# Note that this script will only work for the chosen subset of variant callers.

# arg1: merge.tsv.gz file
# arg2: results.tsv.gz file
# arg3: 'snp' or 'indel'

# Ex: scripts/allele_conflicts.bash out/merged_indel/SRR891269/merge.tsv.gz out/new-classify/classify-indel/SRR891269_even_test/results.tsv.gz indel

if [ "$3" = 'indel' ]; then
# echo -e "CHROM\tPOS\tgatk\tvarscan\tvardict\tpindel\tstrelka\tpg\tmajority\tmajority_idx\tgatk\tvarscan\tvardict\tpindel\tstrelka\tpg\tpg\tprob0\tprob1\tvarca"
zcat "$1" | \
scripts/cgrep.bash - -E '^(CHROM|POS)$|(gatk|varscan|vardict|pindel|illumina-strelka|pg-indel).*~(ALT)$' | \
awk -F $'\t' -v 'OFS=\t' '$3 != $4 || $4 != $5 || $5 != $6 || $6 != $7 || $7 != $3' | \
tail -n+2 | \
awk -F $'\t' -v 'OFS=\t' '{ { for(i=3; i<=NF-1; i++) count[$i]++; } PROCINFO["sorted_in"] = "@val_num_desc"; { for (val in count) { print $0, val, count[val]; break; } } delete count; }' | \
sed 's/\t/,/' | \
LC_ALL=C sort -t $'\t' -k1,1 | \
LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order - <(
zcat "$2" | \
awk -F $"\t" -v 'OFS=\t' 'NR == 1 || $NF == 1' | \
sed 's/\t/,/' | \
LC_ALL=C sort -t $'\t' -k1,1
)
else
# echo -e "CHROM\tPOS\tgatk\tvarscan\tvardict\tpg\tmajority\tmajority_idx\tgatk\tvarscan\tvardict\tpg\tpg\tprob0\tprob1\tvarca"
zcat "$1" | \
scripts/cgrep.bash - -E '^(CHROM|POS)$|(gatk|varscan|vardict|pg-snp).*~(ALT)$' | \
awk -F $'\t' -v 'OFS=\t' '$3 != $4 || $4 != $5 || $5 != $3' | \
tail -n+2 | \
awk -F $'\t' -v 'OFS=\t' '{ { for(i=3; i<=NF-1; i++) count[$i]++; } PROCINFO["sorted_in"] = "@val_num_desc"; { for (val in count) { print $0, val, count[val]; break; } } delete count; }' | \
sed 's/\t/,/' | \
LC_ALL=C sort -t $'\t' -k1,1 | \
LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order - <(
zcat "$2" | \
awk -F $"\t" -v 'OFS=\t' 'NR == 1 || $NF == 1' | \
sed 's/\t/,/' | \
LC_ALL=C sort -t $'\t' -k1,1
)
fi | \
sed 's/,/\t/' | \
LC_ALL=C sort -t $'\t' -k1,1V -k2,2n

# To make a table containing alternative alleles for the following rules (as columns)
# 1) caller priority rule
# 2) majority rule
# 3) platinum genomes
# bcftools query -f '%CHROM\t%POS\t%INFO/CALLER\n' out-original/new-classify/classify-indel/SRR891269_even_test/final.vcf.gz | sed 's/gatk-indel/1/; s/varscan-indel/2/; s/vardict-indel/3/; s/pindel/4/; s/illumina-strelka/5/' | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1 | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order <(zcat out-original/new-classify/classify-indel/SRR891269_even_test/allele_conflicts.tsv.gz | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1) - | cut -f2- | awk -F $'\t' -v 'OFS=\t' '{print $$NF, $7, $6;}' | less
# bcftools query -f '%CHROM\t%POS\t%INFO/CALLER\n' out-original/new-classify/classify-snp/SRR891269_even_test/final.vcf.gz | sed 's/gatk-snp/1/; s/varscan-snp/2/; s/vardict-snp/3/' | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1 | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order <(zcat out-original/new-classify/classify-snp/SRR891269_even_test/allele_conflicts.tsv.gz | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1) - | cut -f2- | awk -F $'\t' -v 'OFS=\t' '{print $$NF, $5, $4;}' | less