diff --git a/tests/unit_test_helpers.py b/tests/unit_test_helpers.py index 0084cfc1..b9538b1b 100644 --- a/tests/unit_test_helpers.py +++ b/tests/unit_test_helpers.py @@ -7,10 +7,15 @@ import signal import psutil import shlex +import csv import sys import io import os +from typing import List + +from tiny.rna.configuration import CSVReader + rules_template = [{'Identity': ("Name", "N/A"), 'Strand': "both", 'Hierarchy': 0, @@ -20,6 +25,28 @@ 'Overlap': "partial"}] +def csv_factory(type: str, rows: List[dict], header=()): + """Returns the file contents of the specified config csv. The written header does NOT match + the fieldnames expected in rows. Fieldnames are expected to be the internal + short names defined in Configuration.CSVReader (for brevity)""" + + if type == "features.csv": + fields = list(CSVReader.tinyrna_sheet_fields['Features Sheet'].values()) + header = {short: long for long, short in CSVReader.tinyrna_sheet_fields['Features Sheet'].items()} + elif type == "samples.csv": + fields = list(CSVReader.tinyrna_sheet_fields['Samples Sheet'].values()) + header = {short: long for long, short in CSVReader.tinyrna_sheet_fields['Samples Sheet'].items()} + else: + sys.exit("Unsupported config file") + + csv_string = io.StringIO() + writer = csv.DictWriter(csv_string, fieldnames=fields) + writer.writerow(header) + writer.writerows(rows) + + return csv_string.getvalue() + + def get_dir_tree(root_path: str) -> dict: """Returns a nested dictionary representation of a given directory tree. 
diff --git a/tests/unit_tests_configuration.py b/tests/unit_tests_configuration.py index a3d56821..e06eb114 100644 --- a/tests/unit_tests_configuration.py +++ b/tests/unit_tests_configuration.py @@ -4,10 +4,11 @@ import unittest from unittest.mock import patch, mock_open, call -from tiny.rna.configuration import Configuration +from tiny.rna.configuration import Configuration, SamplesSheet +from unit_test_helpers import csv_factory -class ConfigurationTests(unittest.TestCase): +class BowtieIndexesTest(unittest.TestCase): @classmethod def setUpClass(self): self.root_cfg_dir = os.path.abspath("../tiny/templates") @@ -135,5 +136,69 @@ def test_verify_bowtie_build_outputs(self): self.assertListEqual(config['bt_index_files'], expected_ebwt) self.assertListEqual(mo.call_args_list, expected_writes) + +class SamplesSheetTest(unittest.TestCase): + + """Does SamplesSheet catch multi-assignment of control condition?""" + + def test_validate_control_group(self): + sheet = csv_factory("samples.csv", [ + {'File': '1.fastq', 'Group': 'G1', 'Replicate': '1', 'Control': True, 'Normalization': ''}, # Good + {'File': '2.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''}, # Good + {'File': '3.fastq', 'Group': 'G2', 'Replicate': '1', 'Control': True, 'Normalization': ''} # Bad + ]) # ^^^ + + exp_contains = r".*(multiple control conditions).*" + with self.assertRaisesRegex(AssertionError, exp_contains), \ + patch('tiny.rna.configuration.open', mock_open(read_data=sheet)), \ + patch('tiny.rna.configuration.os.path.isfile', return_value=True): + SamplesSheet('mock_filename') + + """Does SamplesSheet catch duplicate entries for the same group and rep?""" + def test_validate_group_rep(self): + sheet = csv_factory("samples.csv", [ + {'File': '1.fastq', 'Group': 'G1', 'Replicate': '1', 'Control': True, 'Normalization': ''}, # Good + {'File': '2.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''}, # Good + {'File': '3.fastq', 'Group': 'G1', 
'Replicate': '2', 'Control': True, 'Normalization': ''} # Bad + ]) # ^^^ ^^^ + + exp_contains = r".*(same group and replicate).*" + with self.assertRaisesRegex(AssertionError, exp_contains), \ + patch('tiny.rna.configuration.open', mock_open(read_data=sheet)), \ + patch('tiny.rna.configuration.os.path.isfile', return_value=True): + SamplesSheet('mock_filename') + + """Does SamplesSheet catch fastq files that don't exist, have a bad file extension, or are listed more than once?""" + def test_validate_fastq_filepath(self): + csv_rows = [ + {'File': '1.fastq', 'Group': 'G1', 'Replicate': '1', 'Control': True, 'Normalization': ''}, # Good + {'File': '1.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''}, # Bad + {'File': '2.fasta', 'Group': 'G2', 'Replicate': '1', 'Control': True, 'Normalization': ''} # Bad + ] # ^^^^^^^ + sheet = csv_factory("samples.csv", csv_rows) + + # File doesn't exist + exp_contains = r".*(was not found).*" + with self.assertRaisesRegex(AssertionError, exp_contains), \ + patch('tiny.rna.configuration.open', mock_open(read_data=sheet)): + SamplesSheet('mock_filename') + + # Duplicate filename + exp_contains = r".*(listed more than once).*" + with self.assertRaisesRegex(AssertionError, exp_contains), \ + patch('tiny.rna.configuration.open', mock_open(read_data=sheet)), \ + patch('tiny.rna.configuration.os.path.isfile', return_value=True): + SamplesSheet('mock_filename') + + # Bad file extension + exp_contains = r".*(\.fastq\(\.gz\) extension).*" + csv_rows.pop(0) + sheet = csv_factory("samples.csv", csv_rows) + with self.assertRaisesRegex(AssertionError, exp_contains), \ + patch('tiny.rna.configuration.open', mock_open(read_data=sheet)), \ + patch('tiny.rna.configuration.os.path.isfile', return_value=True): + SamplesSheet('mock_filename') + + if __name__ == '__main__': unittest.main() diff --git a/tests/unit_tests_counter.py b/tests/unit_tests_counter.py index 6d9b7cc3..6d79207f 100644 --- a/tests/unit_tests_counter.py 
+++ b/tests/unit_tests_counter.py @@ -1,6 +1,5 @@ import io import os -import csv import unittest from unittest.mock import patch, mock_open @@ -26,31 +25,32 @@ def setUpClass(self): self.short_sam = helpers.read(self.short_sam_file) self.strand = {'sense': tuple('+'), 'antisense': tuple('-'), 'both': ('+', '-')} + self.csv = staticmethod(helpers.csv_factory) # Represents an unparsed Features Sheet row # Key is the user-facing column header self.csv_feat_row_dict = { - 'Select for...': "Class", - 'with value...': "CSR", - 'Alias by...': "Alias", - 'Tag': '', - 'Hierarchy': "1", - 'Strand': "antisense", - "5' End Nucleotide": '"C,G,U"', # Needs to be double-quoted due to commas - 'Length': "all", - 'Overlap': "Partial", - 'Feature Source': "test_file.gff3" + 'Key': "Class", + 'Value': "CSR", + 'Name': "Alias", + 'Tag': "", + 'Hierarchy': "1", + 'Strand': "antisense", + "nt5end": '"C,G,U"', # Needs to be double-quoted due to commas + 'Length': "all", + 'Overlap': "Partial", + 'Source': "test_file.gff3" } # Represents the parsed Features Sheet row above # Key is the internal short name _row = self.csv_feat_row_dict self.parsed_feat_rule = [{ - 'Identity': (_row['Select for...'], _row['with value...']), + 'Identity': (_row['Key'], _row['Value']), 'Tag': _row['Tag'], 'Hierarchy': int(_row['Hierarchy']), 'Strand': _row['Strand'], - 'nt5end': _row["5' End Nucleotide"].upper().translate({ord('U'): 'T'}), + 'nt5end': _row["nt5end"].upper().translate({ord('U'): 'T'}), 'Length': _row['Length'], 'Overlap': _row['Overlap'].lower() }] @@ -58,44 +58,29 @@ def setUpClass(self): # Represents an unparsed Samples Sheet row # Key is the user-facing column header self.csv_samp_row_dict = { - 'Input FASTQ Files': "test_file.fastq", - 'Sample/Group Name': "test_group", - 'Replicate Number': "0", - 'Control': "", - 'Normalization': '' + 'File': "test_file.fastq", + 'Group': "test_group", + 'Replicate': "0", + 'Control': "", + 'Normalization': "" } # This is the same Samples Sheet row 
above, but with internal names # It does NOT represent the parsed result of loading the Samples Sheet _row = self.csv_samp_row_dict self.parsed_samp_rule = { - 'File': _row['Input FASTQ Files'], - 'Group': _row['Sample/Group Name'], - 'Replicate': _row['Replicate Number'], + 'File': _row['File'], + 'Group': _row['Group'], + 'Replicate': _row['Replicate'], 'Control': _row['Control'], 'Normalization': _row['Normalization'] } # === HELPERS === - - @staticmethod - def csv(type, rows, header=()): - if type == "features.csv": - header = ['Select for...', 'with value...', 'Alias by...', 'Tag', 'Hierarchy', - 'Strand', "5' End Nucleotide", 'Length', 'Overlap', 'Feature Source'] - elif type == "samples.csv": - header = ['Input FASTQ Files', 'Sample/Group Name', 'Replicate Number', 'Control', 'Normalization'] - - csv_string = io.StringIO() - writer = csv.DictWriter(csv_string, fieldnames=header) - writer.writeheader() - writer.writerows(rows) - - return csv_string.getvalue() - def get_parsed_samples_row(self, row, exp_file): + def get_loaded_samples_row(self, row, exp_file): return [{ - 'Name': "_rep_".join(row[i] for i in ["Sample/Group Name", "Replicate Number"]), + 'Name': "_rep_".join(row[i] for i in ["Group", "Replicate"]), 'File': exp_file, 'Norm': row['Normalization'] }] @@ -109,13 +94,13 @@ def test_load_samples_single_cmd(self): inp_file = "test.fastq" exp_file = from_here(mock_samp_sheet_path, "test_aligned_seqs.sam") - row = dict(self.csv_samp_row_dict, **{'Input FASTQ Files': inp_file}) + row = dict(self.csv_samp_row_dict, **{'File': inp_file}) csv = self.csv("samples.csv", [row]) with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): inputs_step = counter.load_samples(mock_samp_sheet_path, is_pipeline=False) - expected_result = self.get_parsed_samples_row(row, exp_file) + expected_result = self.get_loaded_samples_row(row, exp_file) self.assertEqual(inputs_step, expected_result) """Does load_samples correctly parse a single record samples.csv for 
pipeline invocation?""" @@ -125,13 +110,13 @@ def test_load_samples_single_pipeline(self): inp_file = "test.fastq" exp_file = "test_aligned_seqs.sam" - row = dict(self.csv_samp_row_dict, **{'Input FASTQ Files': inp_file}) + row = dict(self.csv_samp_row_dict, **{'File': inp_file}) csv = self.csv("samples.csv", [row]) with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): inputs_pipeline = counter.load_samples(mock_samp_sheet_path, is_pipeline=True) - expected_result = self.get_parsed_samples_row(row, exp_file) + expected_result = self.get_loaded_samples_row(row, exp_file) self.assertEqual(inputs_pipeline, expected_result) """Does load_samples correctly handle duplicate samples? There should be no duplicates.""" @@ -150,21 +135,21 @@ def test_load_samples_duplicate(self): def test_load_samples_sam(self): sam_filename = "/fake/absolute/path/sample.sam" - row = dict(self.csv_samp_row_dict, **{'Input FASTQ Files': sam_filename}) + row = dict(self.csv_samp_row_dict, **{'File': sam_filename}) csv = self.csv("samples.csv", [row]) with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): dummy_file = '/dev/null' inputs = counter.load_samples(dummy_file, is_pipeline=False) - expected_result = self.get_parsed_samples_row(row, sam_filename) + expected_result = self.get_loaded_samples_row(row, sam_filename) self.assertEqual(inputs, expected_result) """Does load_samples throw ValueError if a non-absolute path to a SAM file is provided?""" def test_load_samples_nonabs_path(self): bad = "./dne.sam" - row = dict(self.csv_samp_row_dict, **{'Input FASTQ Files': bad}) + row = dict(self.csv_samp_row_dict, **{'File': bad}) csv = self.csv("samples.csv", [row]) expected_error = "The following file must be expressed as an absolute path:\n" + bad @@ -178,7 +163,7 @@ def test_load_samples_nonabs_path(self): def test_load_samples_bad_extension(self): bad = "./bad_extension.xyz" - row = dict(self.csv_samp_row_dict, **{'Input FASTQ Files': bad}) + row = 
dict(self.csv_samp_row_dict, **{'File': bad}) csv = self.csv("samples.csv", [row]) expected_error = r"The filenames defined in your Samples Sheet must have a \.fastq\(\.gz\) or \.sam extension\.\n" \ @@ -201,8 +186,8 @@ def test_load_config_single_cmd(self): ruleset, gff_files = counter.load_config(dummy_file, is_pipeline=False) expected_ruleset = self.parsed_feat_rule - expected_gff_file = from_here(dummy_file, row['Feature Source']) - expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Alias by...']]])) + expected_gff_file = from_here(dummy_file, row['Source']) + expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Name']]])) self.assertEqual(gff_files, expected_gff_ret) self.assertEqual(ruleset, expected_ruleset) @@ -219,8 +204,8 @@ def test_load_config_single_pipeline(self): ruleset, gff_files = counter.load_config(dummy_file, is_pipeline=True) expected_ruleset = self.parsed_feat_rule - expected_gff_file = os.path.basename(row['Feature Source']) - expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Alias by...']]])) + expected_gff_file = os.path.basename(row['Source']) + expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Name']]])) self.assertEqual(gff_files, expected_gff_ret) self.assertEqual(ruleset, expected_ruleset) @@ -237,8 +222,8 @@ def test_load_config_duplicate_rules(self): ruleset, gff_files = counter.load_config(dummy_filename, False) expected_ruleset = self.parsed_feat_rule - expected_gff_file = from_here(dummy_filename, row['Feature Source']) - expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Alias by...']]])) + expected_gff_file = from_here(dummy_filename, row['Source']) + expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Name']]])) self.assertEqual(gff_files, expected_gff_ret) self.assertEqual(ruleset, expected_ruleset) @@ -247,7 +232,7 @@ def test_load_config_duplicate_rules(self): def test_load_config_rna_to_cDNA(self): row = 
self.csv_feat_row_dict.copy() - row["5' End Nucleotide"] = 'U' + row["nt5end"] = 'U' csv = self.csv("features.csv", [row]) with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): @@ -260,7 +245,7 @@ def test_load_config_rna_to_cDNA(self): def test_load_config_id_name_attr(self): row = self.csv_feat_row_dict.copy() - row['Alias by...'] = 'ID' + row['Name'] = 'ID' csv = self.csv("features.csv", [row]) with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): @@ -268,7 +253,7 @@ def test_load_config_id_name_attr(self): _, gff_files = counter.load_config(dummy_file, False) # Expect {file: [empty Name Attribute list]} - from_dummy = from_here(dummy_file, row['Feature Source']) + from_dummy = from_here(dummy_file, row['Source']) expected = defaultdict(list, zip([from_dummy], [[]])) self.assertEqual(gff_files, expected) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index ea02ac30..5a754853 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -128,7 +128,7 @@ def create_run_directory(self) -> str: return run_dir def get_outfile_path(self, infile: str = None) -> str: - """Returns the path and filename for the processed run config""" + """Returns the path and file for the processed run config""" if infile is None: infile = self.inf return self.joinpath(self['run_directory'], os.path.basename(infile)) @@ -173,8 +173,8 @@ def __init__(self, config_file: str, validate_inputs=False): self.setup_pipeline() self.setup_per_file() self.setup_ebwt_idx() - self.process_sample_sheet() - self.process_feature_sheet() + self.process_samples_sheet() + self.process_features_sheet() if validate_inputs: self.validate_inputs() def load_paths_config(self): @@ -202,59 +202,29 @@ def to_cwl_file_class(input_file_path): if genome is not None ] - def process_sample_sheet(self): - sample_sheet = self.paths.from_here(self['samples_csv']['path']) - sample_sheet_dir = os.path.dirname(sample_sheet) - groups_reps = Counter() + def 
process_samples_sheet(self): + samples_sheet_path = self.paths.from_here(self['samples_csv']['path']) + samples_sheet = SamplesSheet(samples_sheet_path) - csv_reader = CSVReader(sample_sheet, "Samples Sheet") - for row in csv_reader.rows(): - if not os.path.splitext(row['File'])[1] in [".fastq", ".gz"]: - raise ValueError("Files in samples.csv must have a .fastq(.gz) extension:\n%s" % (row['File'],)) - - fastq_file = self.from_here(row['File'], origin=sample_sheet_dir) - sample_basename = self.prefix(os.path.basename(fastq_file)) + self['sample_basenames'] = samples_sheet.sample_basenames + self['control_condition'] = samples_sheet.control_condition + self['run_deseq'] = samples_sheet.is_compatible_df - group_name = row['Group'] - rep_number = row['Replicate'] - groups_reps[group_name] += 1 + self['in_fq'] = [self.cwl_file(fq, verify=False) for fq in samples_sheet.fastq_files] + self['fastp_report_titles'] = [f"{g}_rep_{r}" for g, r in samples_sheet.groups_reps] - self.append_to('sample_basenames', sample_basename) - self.append_to('fastp_report_titles', f"{group_name}_rep_{rep_number}") - if row['Control'].lower() == 'true': - self['control_condition'] = group_name - - try: - self.append_to('in_fq', self.cwl_file(fastq_file)) - except FileNotFoundError: - line = csv_reader.line_num - sys.exit("The fastq file on line %d of your Samples Sheet was not found:\n%s" % (line, fastq_file)) + def process_features_sheet(self): + features_sheet = self.paths.from_here(self['features_csv']['path']) + features_sheet_dir = os.path.dirname(features_sheet) - self.check_deseq_compatibility(groups_reps) - - def check_deseq_compatibility(self, sample_groups): - total_samples = sum(sample_groups.values()) - total_coefficients = len(sample_groups) - degrees_of_freedom = total_samples - total_coefficients - - if degrees_of_freedom < 1: - self['run_deseq'] = False - print("Your experiment design has less than one degree of freedom, which is incompatible " - "with DESeq2. 
The DGE step will be skipped and most plots will not be produced.", - file=sys.stderr) - - def process_feature_sheet(self): - feature_sheet = self.paths.from_here(self['features_csv']['path']) - feature_sheet_dir = os.path.dirname(feature_sheet) - - csv_reader = CSVReader(feature_sheet, "Features Sheet") + csv_reader = CSVReader(features_sheet, "Features Sheet") for row in csv_reader.rows(): - gff_file = self.from_here(row['Source'], origin=feature_sheet_dir) + gff_file = self.from_here(row['Source'], origin=features_sheet_dir) try: self.append_if_absent('gff_files', self.cwl_file(gff_file)) except FileNotFoundError: - line = csv_reader.line_num - sys.exit("The GFF file on line %d of your Features Sheet was not found:\n%s" % (line, gff_file)) + row_num = csv_reader.row_num + sys.exit("The GFF file on line %d of your Features Sheet was not found:\n%s" % (row_num, gff_file)) def setup_per_file(self): """Per-library settings lists to be populated by entries from samples_csv""" @@ -408,10 +378,104 @@ def main(): config_object.write_processed_config(f"processed_{file_basename}") +class SamplesSheet: + def __init__(self, file): + self.csv = CSVReader(file, "Samples Sheet") + self.basename = os.path.basename(file) + self.dir = os.path.dirname(file) + self.file = file + + self.fastq_files = [] + self.groups_reps = [] + self.sample_basenames = [] + self.control_condition = None + self.is_compatible_df = False + + self.read_csv() + + def read_csv(self): + reps_per_group = Counter() + for row in self.csv.rows(): + fastq_file = Configuration.joinpath(self.dir, row['File']) + group_name = row['Group'] + rep_number = row['Replicate'] + is_control = row['Control'].lower() == 'true' + basename = self.get_sample_basename(fastq_file) + + self.validate_fastq_filepath(fastq_file) + self.validate_group_rep(group_name, rep_number) + self.validate_control_group(is_control, group_name) + + self.fastq_files.append(fastq_file) + self.sample_basenames.append(basename) + 
self.groups_reps.append((group_name, rep_number)) + reps_per_group[group_name] += 1 + + if is_control: self.control_condition = group_name + + self.is_compatible_df = self.validate_deseq_compatibility(reps_per_group) + + def validate_fastq_filepath(self, file: str): + """Checks file existence, extension, and duplicate entries. + Args: + file: fastq file path, which has already been resolved relative to self.dir + """ + + root, ext = os.path.splitext(file) + + assert os.path.isfile(file), \ + "The fastq file on row {row_num} of {selfname} was not found:\n\t{file}" \ + .format(row_num=self.csv.row_num, selfname=self.basename, file=file) + + assert ext in (".fastq", ".gz"), \ + "Files in {selfname} must have a .fastq(.gz) extension (row {row_num})"\ + .format(selfname=self.basename, row_num=self.csv.row_num) + + assert file not in self.fastq_files, \ + "Fastq files cannot be listed more than once in {selfname} (row {row_num})"\ + .format(selfname=self.basename, row_num=self.csv.row_num) + + def validate_group_rep(self, group:str, rep:str): + assert (group, rep) not in self.groups_reps, \ + "The same group and replicate number cannot appear on " \ + "more than one row in {selfname} (row {row_num})"\ + .format(selfname=self.basename, row_num=self.csv.row_num) + + def validate_control_group(self, is_control: bool, group: str): + if not is_control: return + assert self.control_condition in (group, None), \ + "tinyRNA does not support multiple control conditions " \ + "(row {row_num} in {selfname}).\nHowever, if the control condition " \ + "is unspecified, all possible comparisons will be made and this " \ + "should accomplish your goal."\ + .format(row_num=self.csv.row_num, selfname=self.basename) + + @staticmethod + def validate_deseq_compatibility(sample_groups: Counter) -> bool: + total_samples = sum(sample_groups.values()) + total_coefficients = len(sample_groups) + degrees_of_freedom = total_samples - total_coefficients + + if degrees_of_freedom < 1: + 
print("Your experiment design has less than one degree of freedom, which is incompatible " + "with DESeq2. The DGE step will be skipped and most plots will not be produced.", + file=sys.stderr) + return False + else: + return True + + @staticmethod + def get_sample_basename(filename): + root, _ = os.path.splitext(filename) + return os.path.basename(root) + class CSVReader(csv.DictReader): """A simple wrapper class for csv.DictReader - This makes field labels consistent across the project and simplifies the code + This makes field labels consistent across the project, simplifies the code, and + allows for validation and reordering of column names. We also keep track of the + row number for diagnostic outputs; the base class offers the line_num attribute, + but line_num != row_num if a record spans multiple lines in the csv. """ # user-facing name -> internal short name @@ -440,6 +504,7 @@ class CSVReader(csv.DictReader): def __init__(self, filename: str, doctype: str = None): self.doctype = doctype self.tinyrna_file = filename + self.row_num = 0 try: self.tinyrna_fields = tuple(CSVReader.tinyrna_sheet_fields[doctype].values()) except KeyError as ke: @@ -455,6 +520,7 @@ def rows(self): self.validate_csv_header(header) for row in self: + self.row_num += 1 yield row def validate_csv_header(self, header: OrderedDict):