Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 25 additions & 15 deletions bin/GToTree
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[0;33m'
NC='\033[0m'
VERSION="v1.8.9"
VERSION="v1.8.10"

if [ "$1" == "--version" ] || [ "$1" == "-v" ]; then
printf "GToTree ${VERSION}\n"
Expand Down Expand Up @@ -281,6 +281,7 @@ keep_individual_alignments='false'
tree_program='FastTreeMP'
additional_pfam_targets='false'
ko_targets='false'
target_KOs='none'
override_faster_alignment='false'
http_flag='false'
nucleotide='false'
Expand Down Expand Up @@ -408,7 +409,7 @@ if [ -f "$NCBI_acc_file" ]; then
fi

### if KO targets were provided, checking that KO data is present already, and downloading if it isn't
if [ "$target_KOs" != "" ]; then
if [ "$target_KOs" != "none" ]; then
gtt-get-kofamscan-data
fi

Expand Down Expand Up @@ -1098,7 +1099,7 @@ if [ $output_dir != "GToTree_output" ] || [ "$file_to_genome_id_map" != "" ] ||
[ $num_jobs != "1" ] || [ $num_cpus != 2 ] || [ $lineage_spec != "Domain,Phylum,Class,Species,Strain" ] || \
[ $tree_program != "FastTreeMP" ] || [ $wanting_but_missing_FastTreeMP == "true" ] || \
[ $align_only == 'true' ] || [ $keep_individual_alignments == 'true' ] || \
[ ! -z $target_pfams ] || [ ! -z $target_KOs ] || [ $nucleotide != 'false' ]; then
[ ! -z $target_pfams ] || [ $target_KOs != "none" ] || [ $nucleotide != 'false' ]; then

if [ "$file_to_genome_id_map" != "" ]; then
if [ ! -s $file_to_genome_id_map ]; then
Expand Down Expand Up @@ -1198,7 +1199,7 @@ if [ $output_dir != "GToTree_output" ] || [ "$file_to_genome_id_map" != "" ] ||
fi

#### checking and reporting if KOs targeted ####
if [ "$target_KOs" != "" ]; then
if [ "$target_KOs" != "none" ]; then

if [ -s "$target_KOs" ]; then
uniq ${target_KOs} > uniq_ko_targets.tmp
Expand Down Expand Up @@ -1405,6 +1406,14 @@ if [ $? -ne 0 ] ; then
exit
fi

# Adding a place to store NCBI downloads if needed: the directory is created
# only when debug_flag is 'true' and an NCBI accessions file was provided.
# Expansions are quoted so that an empty/unset debug_flag or a tmp_dir
# containing whitespace cannot break the test or the mkdir call.
if [[ "${debug_flag}" == 'true' && -n "${NCBI_acc_file}" ]]; then
    mkdir -p "${tmp_dir}/ncbi-downloads"
fi

#############################################################################
########################### SOME LAST ADJUSTMENTS #########################
#############################################################################

### making sure each id provided in the mapping file (if given) is found in one
### of the genome input files
Expand Down Expand Up @@ -1617,15 +1626,16 @@ if [ -n "$NCBI_acc_file" ]; then
### running in parallel if set, otherwise running in serial ###
if [ $num_jobs == "1" ]; then
if [ "$nucleotide" == 'false' ]; then
gtt-ncbi-serial.sh ${tmp_dir}/ncbi_accessions_info.tmp $tmp_dir $hmm_file $NCBI_remaining_genomes_total $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode $additional_pfam_targets $http_flag ${ko_targets} ${target_KOs}
gtt-ncbi-serial.sh "${tmp_dir}/ncbi_accessions_info.tmp" "$tmp_dir" "$hmm_file" "$NCBI_remaining_genomes_total" "$num_cpus" "$hmm_target_genes_total" "$output_dir" "$best_hit_mode" "$additional_pfam_targets" "$http_flag" "${ko_targets}" "${target_KOs}" "${debug_flag}"
else
gtt-ncbi-serial-nt.sh ${tmp_dir}/ncbi_accessions_info.tmp $tmp_dir $hmm_file $NCBI_remaining_genomes_total $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode $additional_pfam_targets $http_flag ${ko_targets} ${target_KOs}
gtt-ncbi-serial-nt.sh "${tmp_dir}/ncbi_accessions_info.tmp" "$tmp_dir" "$hmm_file" "$NCBI_remaining_genomes_total" "$num_cpus" "$hmm_target_genes_total" "$output_dir" "$best_hit_mode" "$additional_pfam_targets" "$http_flag" "${ko_targets}" "${target_KOs}" "${debug_flag}"
fi
else
if [ "$nucleotide" == 'false' ]; then
cat ${tmp_dir}/ncbi_accessions_info.tmp | parallel -j $num_jobs gtt-ncbi-parallel.sh {} $tmp_dir $hmm_file $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode $additional_pfam_targets $http_flag ${ko_targets} ${target_KOs}
printf "\n\ndebug flag is $debug_flag\n\n"
cat ${tmp_dir}/ncbi_accessions_info.tmp | parallel --quote -j "$num_jobs" -- gtt-ncbi-parallel.sh "{}" "$tmp_dir" "$hmm_file" "$num_cpus" "$hmm_target_genes_total" "$output_dir" "$best_hit_mode" "$additional_pfam_targets" "$http_flag" "${ko_targets}" "${target_KOs}" "${debug_flag}"
else
cat ${tmp_dir}/ncbi_accessions_info.tmp | parallel -j $num_jobs gtt-ncbi-parallel-nt.sh {} $tmp_dir $hmm_file $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode $additional_pfam_targets $http_flag ${ko_targets} ${target_KOs}
cat ${tmp_dir}/ncbi_accessions_info.tmp | parallel --quote -j "$num_jobs" -- gtt-ncbi-parallel-nt.sh "{}" "$tmp_dir" "$hmm_file" "$num_cpus" "$hmm_target_genes_total" "$output_dir" "$best_hit_mode" "$additional_pfam_targets" "$http_flag" "${ko_targets}" "${target_KOs}" "${debug_flag}"
fi
fi

Expand Down Expand Up @@ -3668,24 +3678,24 @@ if [ -f ${output_dir}/Redundant_input_accessions.txt ] || [ -f ${output_dir}/NCB

if [ -f ${output_dir}/Redundant_input_accessions.txt ]; then
printf " $num_dupe_report accession(s) redundant.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
cp ${output_dir}/Redundant_input_accessions.txt ${output_dir}/run_files/Redundant_input_accessions.txt
mv ${output_dir}/Redundant_input_accessions.txt ${output_dir}/run_files/Redundant_input_accessions.txt
fi

if [ -f ${output_dir}/NCBI_accessions_not_found.txt ]; then
num_accs_not_found=$(wc -l ${output_dir}/NCBI_accessions_not_found.txt | sed "s/^ *//" | cut -d " " -f 1)
printf " $num_accs_not_found accession(s) not successfully found at NCBI.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
cp ${output_dir}/NCBI_accessions_not_found.txt ${output_dir}/run_files/NCBI_accessions_not_found.txt
mv ${output_dir}/NCBI_accessions_not_found.txt ${output_dir}/run_files/NCBI_accessions_not_found.txt
fi

if [ -f ${output_dir}/NCBI_accessions_not_downloaded.txt ]; then
num_accs_not_downloaded=$(wc -l ${output_dir}/NCBI_accessions_not_downloaded.txt | sed "s/^ *//" | cut -d " " -f 1)
printf " $num_accs_not_downloaded did not download properly.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
cp ${output_dir}/NCBI_accessions_not_downloaded.txt ${output_dir}/run_files/NCBI_accessions_not_downloaded.txt
mv ${output_dir}/NCBI_accessions_not_downloaded.txt ${output_dir}/run_files/NCBI_accessions_not_downloaded.txt
fi

if [ -f ${output_dir}/Genomes_removed_for_too_few_hits.tsv ]; then
printf " $removed_genomes genome(s) removed due to having too few hits to the targeted SCGs.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
cp ${output_dir}/Genomes_removed_for_too_few_hits.tsv ${output_dir}/run_files/Genomes_removed_for_too_few_hits.tsv
mv ${output_dir}/Genomes_removed_for_too_few_hits.tsv ${output_dir}/run_files/Genomes_removed_for_too_few_hits.tsv
fi

# reporting of unsuccessful targets depends on whether run in best-hit mode (-B) or not
Expand All @@ -3699,21 +3709,21 @@ if [ -f ${output_dir}/Redundant_input_accessions.txt ] || [ -f ${output_dir}/NCB

if [ -f ${output_dir}/Target_genes_not_found.txt ]; then
printf " $removed_genes gene(s) had no hits in any genome.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
cp ${output_dir}/Target_genes_not_found.txt ${output_dir}/run_files/Target_genes_not_found.txt
mv ${output_dir}/Target_genes_not_found.txt ${output_dir}/run_files/Target_genes_not_found.txt
fi

fi


if [ -f ${output_dir}/Genes_with_no_hits_after_length_filter.txt ]; then
printf " $removed_genes2 gene(s) had no hits after filtering by length.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
cp ${output_dir}/Genes_with_no_hits_after_length_filter.txt ${output_dir}/run_files/Genes_with_no_hits_after_length_filter.txt
mv ${output_dir}/Genes_with_no_hits_after_length_filter.txt ${output_dir}/run_files/Genes_with_no_hits_after_length_filter.txt
fi

if [ -f ${output_dir}/Genomes_with_questionable_redundancy_estimates.tsv ]; then
num_genomes_high_redund=$(tail -n +2 ${output_dir}/Genomes_with_questionable_redundancy_estimates.tsv | wc -l | sed "s/^ *//" | cut -d " " -f 1)
printf " $num_genomes_high_redund genome(s) had an estimated redundancy of >= 10%%.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
cp ${output_dir}/Genomes_with_questionable_redundancy_estimates.tsv ${output_dir}/run_files/Genomes_with_questionable_redundancy_estimates.tsv
mv ${output_dir}/Genomes_with_questionable_redundancy_estimates.tsv ${output_dir}/run_files/Genomes_with_questionable_redundancy_estimates.tsv
fi

printf "\n Reported along with additional informative run files in:\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
Expand Down
39 changes: 26 additions & 13 deletions bin/gtt-ncbi-parallel-nt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,17 @@ RED='\033[0;31m'
ORANGE='\033[0;33m'
NC='\033[0m'

tmp_dir=$2
hmm_file=$3
num_cpus=$4
hmm_target_genes_total=$5
output_dir=$6
best_hit_mode=$7
additional_pfam_targets=$8
http_flag=$9
ko_targets=${10}
target_KOs=${11}
# Capture positional arguments 2-12 into named variables. Each expansion is
# quoted so a value containing whitespace is stored intact.
# $1 is not captured here; it is parsed right after this block with `cut`
# to obtain the assembly and downloaded accession.
# NOTE(review): debug_flag comes from ${12}; it will be empty if a caller
# supplies only 11 arguments.
tmp_dir="$2"
hmm_file="$3"
num_cpus="$4"
hmm_target_genes_total="$5"
output_dir="$6"
best_hit_mode="$7"
additional_pfam_targets="$8"
http_flag="$9"
# positions past 9 need braces: "$10" would expand as "${1}0"
ko_targets="${10}"
target_KOs="${11}"
debug_flag="${12}"

assembly=$(echo "$1" | cut -f 1)
downloaded_accession=$(echo "$1" | cut -f 2)
Expand All @@ -38,12 +39,12 @@ if [ $base_link == "na" ] || [ -z $base_link ]; then
fi

# checking if GCF or GCA
if [[ $assembly == "GCF"* ]]; then
if [[ $assembly == "GCF"* ]]; then
p2="GCF"
else
p2="GCA"
fi

p3=$(echo $assembly | cut -f 2 -d "_" | cut -c 1-3)
p4=$(echo $assembly | cut -f 2 -d "_" | cut -c 4-6)
p5=$(echo $assembly | cut -f 2 -d "_" | cut -c 7-9)
Expand Down Expand Up @@ -206,9 +207,21 @@ if $(file ${tmp_dir}/${assembly}_genome.tmp.gz | grep -q gzip); then

fi

# In debug mode, keep this assembly's sequence files by moving them into
# ${tmp_dir}/ncbi-downloads under descriptive names. Only files that exist and
# are non-empty (-s) are moved; anything else is left where it is.
# All expansions are quoted so an empty debug_flag or a path containing
# whitespace cannot break the tests or the mv calls.
if [[ "${debug_flag}" == "true" ]]; then
    ncbi_downloads_dir="${tmp_dir}/ncbi-downloads"
    # no-op when the directory already exists; keeps the mv calls from failing otherwise
    mkdir -p "${ncbi_downloads_dir}"

    if [[ -s "${tmp_dir}/${assembly}_genes2.faa.tmp" ]]; then
        mv -- "${tmp_dir}/${assembly}_genes2.faa.tmp" "${ncbi_downloads_dir}/${assembly}_protein.faa"
    fi
    if [[ -s "${tmp_dir}/${assembly}_genes1.fa.tmp" ]]; then
        mv -- "${tmp_dir}/${assembly}_genes1.fa.tmp" "${ncbi_downloads_dir}/${assembly}_cds.fa"
    fi
    if [[ -s "${tmp_dir}/${assembly}_genome.tmp" ]]; then
        mv -- "${tmp_dir}/${assembly}_genome.tmp" "${ncbi_downloads_dir}/${assembly}_genomic.fna"
    fi
fi

rm -rf ${tmp_dir}/${assembly}_genes*.tmp*
rm -rf ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp
rm -rf ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp
rm -rf ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp
rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp

else
Expand Down
66 changes: 53 additions & 13 deletions bin/gtt-ncbi-parallel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,42 @@ RED='\033[0;31m'
ORANGE='\033[0;33m'
NC='\033[0m'

tmp_dir=$2
hmm_file=$3
num_cpus=$4
hmm_target_genes_total=$5
output_dir=$6
best_hit_mode=$7
additional_pfam_targets=$8
http_flag=$9
ko_targets=${10}
target_KOs=${11}
# Capture positional arguments 2-12 into named variables. Each expansion is
# quoted so a value containing whitespace is stored intact.
# $1 is not captured here; it is parsed after this block with `cut`.
tmp_dir="$2"
hmm_file="$3"
num_cpus="$4"
hmm_target_genes_total="$5"
output_dir="$6"
best_hit_mode="$7"
additional_pfam_targets="$8"
http_flag="$9"
# positions past 9 need braces: "$10" would expand as "${1}0"
ko_targets="${10}"
target_KOs="${11}"
debug_flag="${12}"

# NOTE(review): the two report groups below print on every invocation and look
# like leftover debugging aids - confirm whether they should stay.
# Values are passed as printf arguments (never as the format string) so a '%'
# or backslash inside a path is printed literally instead of being interpreted.

# group 1: each named setting as "name: value"
printf '\n\n'
for setting_name in tmp_dir hmm_file num_cpus hmm_target_genes_total output_dir \
    best_hit_mode additional_pfam_targets http_flag ko_targets target_KOs debug_flag; do
    # ${!setting_name} is indirect expansion: the value of the variable so named
    printf '%s: %s\n' "${setting_name}" "${!setting_name}"
done
printf '\n'

# group 2: the raw positional arguments 2-12 as "arg N: value"
printf '\n\n'
for (( arg_index = 2; arg_index <= 12; arg_index++ )); do
    # ${!arg_index} resolves to the positional parameter with that number
    printf 'arg %s: %s\n' "${arg_index}" "${!arg_index}"
done
printf '\n'


assembly=$(echo "$1" | cut -f 1)
downloaded_accession=$(echo "$1" | cut -f 2)
Expand All @@ -38,12 +64,12 @@ if [ $base_link == "na" ] || [ -z $base_link ]; then
fi

# checking if GCF or GCA
if [[ $assembly == "GCF"* ]]; then
if [[ $assembly == "GCF"* ]]; then
p2="GCF"
else
p2="GCA"
fi

p3=$(echo $assembly | cut -f 2 -d "_" | cut -c 1-3)
p4=$(echo $assembly | cut -f 2 -d "_" | cut -c 4-6)
p5=$(echo $assembly | cut -f 2 -d "_" | cut -c 7-9)
Expand Down Expand Up @@ -213,9 +239,23 @@ if [ -s ${tmp_dir}/${assembly}_genes3.tmp ]; then

fi

# NOTE(review): this prints the bare debug_flag value on every invocation and
# looks like leftover debugging output - confirm whether it should stay.
printf '\n\n %s\n\n' "${debug_flag}"

# In debug mode, keep this assembly's sequence files by moving them into
# ${tmp_dir}/ncbi-downloads under descriptive names, reporting each file kept.
# Only files that exist and are non-empty (-s) are moved.
# All expansions are quoted, and paths are passed as printf arguments rather
# than as the format string, so whitespace or '%' in a path is handled literally.
if [[ "${debug_flag}" == "true" ]]; then
    printf "\n\n Debug mode on, keeping intermediate files.\n\n"
    ncbi_downloads_dir="${tmp_dir}/ncbi-downloads"
    # no-op when the directory already exists; keeps the mv calls from failing otherwise
    mkdir -p "${ncbi_downloads_dir}"

    if [[ -s "${tmp_dir}/${assembly}_genes2.tmp" ]]; then
        printf ' Keeping %s\n' "${tmp_dir}/${assembly}_genes2.tmp"
        mv -- "${tmp_dir}/${assembly}_genes2.tmp" "${ncbi_downloads_dir}/${assembly}_protein.faa"
    fi
    if [[ -s "${tmp_dir}/${assembly}_genome.tmp" ]]; then
        printf ' Keeping %s\n' "${tmp_dir}/${assembly}_genome.tmp"
        mv -- "${tmp_dir}/${assembly}_genome.tmp" "${ncbi_downloads_dir}/${assembly}_genomic.fna"
    fi
fi

rm -rf ${tmp_dir}/${assembly}_genes3.tmp ${tmp_dir}/${assembly}_genes2.tmp ${tmp_dir}/${assembly}_genes1.tmp
rm -rf ${tmp_dir}/${assembly}_genes.tmp ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp
rm -rf ${tmp_dir}/${assembly}_genes.tmp.ssi ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp
rm -rf ${tmp_dir}/${assembly}_genes.tmp.ssi ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp
rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp

else
Expand Down
43 changes: 28 additions & 15 deletions bin/gtt-ncbi-serial-nt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,18 @@ RED='\033[0;31m'
ORANGE='\033[0;33m'
NC='\033[0m'

tmp_dir=$2
hmm_file=$3
NCBI_remaining_genomes_total=$4
num_cpus=$5
hmm_target_genes_total=$6
output_dir=$7
best_hit_mode=$8
additional_pfam_targets=$9
http_flag=${10}
ko_targets=${11}
target_KOs=${12}
# Capture positional arguments 2-13 into named variables. Each expansion is
# quoted so a value containing whitespace is stored intact.
# $1 is not captured here.
# NOTE(review): $1 is presumably the accession-info file the caller passes
# first - verify against the rest of the script.
# NOTE(review): debug_flag comes from ${13}; it will be empty if a caller
# supplies only 12 arguments.
tmp_dir="$2"
hmm_file="$3"
NCBI_remaining_genomes_total="$4"
num_cpus="$5"
hmm_target_genes_total="$6"
output_dir="$7"
best_hit_mode="$8"
additional_pfam_targets="$9"
# positions past 9 need braces: "$10" would expand as "${1}0"
http_flag="${10}"
ko_targets="${11}"
target_KOs="${12}"
debug_flag="${13}"

num=0

Expand Down Expand Up @@ -50,12 +51,12 @@ do
fi

# checking if GCF or GCA
if [[ $assembly == "GCF"* ]]; then
if [[ $assembly == "GCF"* ]]; then
p2="GCF"
else
p2="GCA"
fi

p3=$(echo $assembly | cut -f 2 -d "_" | cut -c 1-3)
p4=$(echo $assembly | cut -f 2 -d "_" | cut -c 4-6)
p5=$(echo $assembly | cut -f 2 -d "_" | cut -c 7-9)
Expand All @@ -68,7 +69,7 @@ do
else

end_path=$(basename $base_link)

fi

# attempting to download genome fasta
Expand Down Expand Up @@ -222,9 +223,21 @@ do

fi

# In debug mode, keep the current assembly's sequence files by moving them into
# ${tmp_dir}/ncbi-downloads under descriptive names. Only files that exist and
# are non-empty (-s) are moved; anything else is left where it is.
# All expansions are quoted so an empty debug_flag or a path containing
# whitespace cannot break the tests or the mv calls.
if [[ "${debug_flag}" == "true" ]]; then
    ncbi_downloads_dir="${tmp_dir}/ncbi-downloads"
    # no-op when the directory already exists; keeps the mv calls from failing otherwise
    mkdir -p "${ncbi_downloads_dir}"

    if [[ -s "${tmp_dir}/${assembly}_genes2.faa.tmp" ]]; then
        mv -- "${tmp_dir}/${assembly}_genes2.faa.tmp" "${ncbi_downloads_dir}/${assembly}_protein.faa"
    fi
    if [[ -s "${tmp_dir}/${assembly}_genes1.fa.tmp" ]]; then
        mv -- "${tmp_dir}/${assembly}_genes1.fa.tmp" "${ncbi_downloads_dir}/${assembly}_cds.fa"
    fi
    if [[ -s "${tmp_dir}/${assembly}_genome.tmp" ]]; then
        mv -- "${tmp_dir}/${assembly}_genome.tmp" "${ncbi_downloads_dir}/${assembly}_genomic.fna"
    fi
fi

rm -rf ${tmp_dir}/${assembly}_genes*.tmp*
rm -rf ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp
rm -rf ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp
rm -rf ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp
rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp

else
Expand Down
Loading