diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index d08249d8..114da656 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -1,6 +1,7 @@ import pandas as pd -import mmap +import gzip import json +import mmap import csv import sys import os @@ -9,6 +10,7 @@ from abc import abstractmethod, ABC from typing import Tuple, Optional, Union from collections import Counter, defaultdict +from glob import glob from ..util import make_filename @@ -385,9 +387,10 @@ def write_output_logfile(self): self.df_to_csv(self.pipeline_stats_df, "Summary Statistics", self.prefix, "summary_stats") def library_has_collapser_outputs(self, other: LibraryStats) -> bool: - collapsed_fa = other.library['basename'] + "_collapsed.fa" - if os.path.isfile(collapsed_fa): - other.library['collapsed'] = collapsed_fa + # Collapser outputs may have been gzipped. Accept either filename. + collapsed_fa = glob(other.library['basename'] + "_collapsed.fa*") + if len(collapsed_fa): + other.library['collapsed'] = collapsed_fa[0] return True else: self.missing_collapser_outputs.append(other.library['basename']) @@ -420,8 +423,19 @@ def get_fastp_stats(self, other: LibraryStats) -> Union[Tuple[int, int], Tuple[N def get_collapser_stats(self, other: LibraryStats) -> Optional[int]: """Determine the total number of unique sequences (after quality filtering) in this library""" - with open(other.library['collapsed'], 'r') as f: - with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: + collapsed_fa = other.library['collapsed'] + + if collapsed_fa.endswith('.gz'): + # Random access is much harder with gzip + # Instead, get last header in last 250 bytes + with gzip.open(collapsed_fa) as g: + g.seek(-250, os.SEEK_END) + lines = g.readlines() + while lines[-1] == "": lines.pop(-1) + last_header = lines[-2].decode() + count = last_header.split('_count=')[0][1:] + else: + with open(collapsed_fa, 'r') as f, mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm: from_pos = mm.rfind(b">") + 1 to_pos = mm.rfind(b"_count=") count = mm[from_pos:to_pos]