From 45e3824ff388cd224c308566a809392be27c4b01 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 9 Aug 2022 20:27:32 -0700 Subject: [PATCH 1/2] SummaryStats now searches for collapsed outputs using a glob pattern with a terminating wildcard. This will return the correct filename regardless of the user's compression settings for tiny-collapse. When a gzipped filename is detected, we have to take a more brutish approach to parsing the unique sequence # from the last FASTA header. Read the last 250 bytes from the file, parse lines, remove trailing blank lines, then assume the second to last line to be the final header. --- tiny/rna/counter/statistics.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index d08249d8..6441b2b7 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -1,6 +1,7 @@ import pandas as pd -import mmap +import gzip import json +import mmap import csv import sys import os @@ -9,6 +10,7 @@ from abc import abstractmethod, ABC from typing import Tuple, Optional, Union from collections import Counter, defaultdict +from glob import glob from ..util import make_filename @@ -385,9 +387,10 @@ def write_output_logfile(self): self.df_to_csv(self.pipeline_stats_df, "Summary Statistics", self.prefix, "summary_stats") def library_has_collapser_outputs(self, other: LibraryStats) -> bool: - collapsed_fa = other.library['basename'] + "_collapsed.fa" - if os.path.isfile(collapsed_fa): - other.library['collapsed'] = collapsed_fa + # Collapser outputs may have been gzipped. Accept either filename. + collapsed_fa = glob(other.library['basename'] + "_collapsed.fa*") + if os.path.isfile(collapsed_fa[0]): + other.library['collapsed'] = collapsed_fa[0] return True else: self.missing_collapser_outputs.append(other.library['basename']) @@ -420,8 +423,19 @@ def get_fastp_stats(self, other: LibraryStats) -> Union[Tuple[int, int], Tuple[N def get_collapser_stats(self, other: LibraryStats) -> Optional[int]: """Determine the total number of unique sequences (after quality filtering) in this library""" - with open(other.library['collapsed'], 'r') as f: - with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: + collapsed_fa = other.library['collapsed'] + + if collapsed_fa.endswith('.gz'): + # Random access is much harder with gzip + # Instead, get last header in last 250 bytes + with gzip.open(collapsed_fa) as g: + g.seek(-250, os.SEEK_END) + lines = g.readlines() + while lines[-1] == "": lines.pop(-1) + last_header = lines[-2].decode() + count = last_header.split('_count=')[0][1:] + else: + with open(collapsed_fa, 'r') as f, mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: from_pos = mm.rfind(b">") + 1 to_pos = mm.rfind(b"_count=") count = mm[from_pos:to_pos] From b500240e6129bebd22968c63f14ff00d3bd76ff7 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 10 Aug 2022 16:39:59 -0700 Subject: [PATCH 2/2] Correction for standalone runs --- tiny/rna/counter/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 6441b2b7..114da656 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -389,7 +389,7 @@ def write_output_logfile(self): def library_has_collapser_outputs(self, other: LibraryStats) -> bool: # Collapser outputs may have been gzipped. Accept either filename. collapsed_fa = glob(other.library['basename'] + "_collapsed.fa*") - if os.path.isfile(collapsed_fa[0]): + if len(collapsed_fa): other.library['collapsed'] = collapsed_fa[0] return True else: