From 43414d238781a123e47b05ba5eb0426fd3d86c84 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 9 Nov 2022 11:04:00 -0500 Subject: [PATCH 1/2] GFFValidator can now read gzipped fasta files. The gzip r/w interface has been moved from tiny-collapse into tiny.rna.util so that both it and GFFValidator can use it --- tiny/rna/collapser.py | 7 ++----- tiny/rna/counter/validation.py | 7 +++++-- tiny/rna/util.py | 7 ++++++- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/tiny/rna/collapser.py b/tiny/rna/collapser.py index 4e3504ff..2b9422ff 100644 --- a/tiny/rna/collapser.py +++ b/tiny/rna/collapser.py @@ -8,21 +8,18 @@ import argparse import builtins -import gzip import os from collections import Counter -from functools import partial from typing import Tuple, Iterable +from tiny.rna.util import gzip_open as gz_f + try: from _collections import _count_elements # Load Counter's C helper function if it is available except ImportError: from collections import _count_elements # Slower mapping[elem] = mapping.get(elem,default_val)+1 -# The GZIP read/write interface used by seq_counter() and seq2fasta() -gz_f = partial(gzip.GzipFile, compresslevel=6, fileobj=None, mtime=0) - def get_args() -> 'argparse.NameSpace': """Get command line arguments""" diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index 40485e25..b9d21615 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -1,11 +1,12 @@ import functools import subprocess import sys +import os from collections import Counter, defaultdict -from tiny.rna.util import sorted_natural from tiny.rna.counter.hts_parsing import parse_gff, ReferenceTables +from tiny.rna.util import sorted_natural, gzip_open class ReportFormatter: @@ -198,7 +199,9 @@ def chroms_shared_with_genomes(self, genome_fastas): genome_chroms = set() for fasta in genome_fastas: - with open(fasta, 'rb') as f: + _, ext = os.path.splitext(fasta) + file_if = gzip_open if ext == '.gz' else open + with file_if(fasta, 'rb') as f: for line in f: if line[0] == ord('>'): genome_chroms.add(line[1:].strip().decode()) diff --git a/tiny/rna/util.py b/tiny/rna/util.py index ac94fd0c..c222469e 100644 --- a/tiny/rna/util.py +++ b/tiny/rna/util.py @@ -1,6 +1,7 @@ import argparse import functools import textwrap +import gzip import time import os import re @@ -93,4 +94,8 @@ def sorted_natural(lines, reverse=False): convert = lambda text: int(text) if text.isdigit() else text.lower() alphanum_key = lambda key: [convert(c) for c in re.split(r'(\d+)', key)] - return sorted(lines, key=alphanum_key, reverse=reverse) \ No newline at end of file + return sorted(lines, key=alphanum_key, reverse=reverse) + + +# File IO interface for reading and writing Gzip files +gzip_open = functools.partial(gzip.GzipFile, compresslevel=6, fileobj=None, mtime=0) \ No newline at end of file From 46e0e983beb96a61178ae63a7dd435f0412ae615 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 9 Nov 2022 15:38:47 -0500 Subject: [PATCH 2/2] Slightly clarified logic in GFFValidator.chroms_shared_with_genomes() --- tiny/rna/counter/validation.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index 0cb8eb7a..dbc6d5cc 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -213,10 +213,12 @@ def chroms_shared_with_genomes(self, genome_fastas): genome_chroms = set() for fasta in genome_fastas: - if not os.path.isfile(fasta): continue - - _, ext = os.path.splitext(fasta) - file_if = gzip_open if ext == '.gz' else open + if not os.path.isfile(fasta): + continue + elif fasta.endswith('.gz'): + file_if = gzip_open + else: + file_if = open with file_if(fasta, 'rb') as f: for line in f: