diff --git a/setup.py b/setup.py index aa2ed2d..555367f 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,7 @@ ], entry_points={ 'console_scripts': [ + 'AttachBarcodes = sctools.platform:BarcodePlatform.attach_barcodes', 'Attach10xBarcodes = sctools.platform:TenXV2.attach_barcodes', 'SplitBam = sctools.platform:GenericPlatform.split_bam', 'CalculateGeneMetrics = sctools.platform:GenericPlatform.calculate_gene_metrics', diff --git a/src/sctools/platform.py b/src/sctools/platform.py index 9589609..eeb8a25 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -629,3 +629,318 @@ def attach_barcodes(cls, args=None): cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators) return 0 + + +class BarcodePlatform(GenericPlatform): + """Command Line Interface for extracting and attaching barcodes with specified positions + generalizing TenXV2 attach barcodes + + Sample, cell and/or molecule barcodes can be extracted and attached to an unmapped bam when the + corresponding barcode's start position and and length are provided. The sample barcode is extracted + from the index i7 fastq file and the cell and molecule barcode are extracted from the r1 fastq file + + This class defines several methods that are created as CLI tools when sctools is installed + (see setup.py) + + Attributes + ---------- + cell_barcode : fastq.EmbeddedBarcode + A data class that defines the start and end position of the cell barcode and the tags to + assign the sequence and quality of the cell barcode + molecule_barcode : fastq.EmbeddedBarcode + A data class that defines the start and end position of the molecule barcode and the tags + to assign the sequence and quality of the molecule barcode + sample_barcode : fastq.EmbeddedBarcode + A data class that defines the start and end position of the sample barcode and the tags + to assign the sequence and quality of the sample barcode + + Methods + ------- + attach_barcodes() + Attach barcodes from the forward (r1) and optionally index (i1) fastq files to the reverse + (r2) bam file + + """ + cell_barcode = None + molecule_barcode = None + sample_barcode = None + + @classmethod + def _validate_barcode_args(cls, args): + """Validates that the barcode start position is greater than 0 + + Parameters + ---------- + args : object + arguments list, The default value of None, when passed to `parser.parse_args` + causes the parser to read `sys.argv` + + Returns + ------- + args : object + return arguments list if valid + + """ + # check that if a barcode start position is provided, its length is also (and vice versa) + cls._validate_barcode_length_and_position(args.cell_barcode_start_pos, args.cell_barcode_length) + cls._validate_barcode_length_and_position(args.molecule_barcode_start_pos, args.molecule_barcode_length) + cls._validate_barcode_length_and_position(args.sample_barcode_start_pos, args.sample_barcode_length) + + # check that an index fastq is provided sample barcode length and position are given + if args.i1 is None and args.sample_barcode_length: + raise argparse.ArgumentError('An i7 index fastq file must be given to attach a sample barcode') + + # check that cell and molecule barcodes don't overlap + if args.cell_barcode_length and args.molecule_barcode_length: + cls._validate_barcode_input(args.molecule_barcode_start_pos, + args.cell_barcode_start_pos + args.cell_barcode_length) + + return args + + @classmethod + def _validate_barcode_length_and_position(cls, barcode_start_position, barcode_length): + """Checks that either that both barcode length and position are given or that neither are given as arguments + + Parameters + ---------- + barcode_start_position : int + the user defined start position (base pairs) of the barcode + + barcode_length : int + the user defined length (base pairs) of the barcode + + Returns + ------- + given_value : int + return given value if valid + + """ + barcode_start_pos_exists = bool(barcode_start_position) or (barcode_start_position == 0) + barcode_length_exists = bool(barcode_length) + # (XOR boolean logic) + if (barcode_start_pos_exists != barcode_length_exists): + raise argparse.ArgumentError('Invalid position/length, both position and length must be provided by the user together') + + @classmethod + def _validate_barcode_input(cls, given_value, min_value): + """Validates that the barcode input is greater than a min value + + Parameters + ---------- + given_value : int + the given value that must be greater than the min_value, + (barcode length or barcode starting position) + + min_value : int + the min value that the given_value must be greater than + + Returns + ------- + given_value : int + return given value if valid + + """ + if given_value < min_value: + raise argparse.ArgumentTypeError('Invalid barcode length/position') + return given_value + + @classmethod + def _validate_barcode_start_pos(cls, given_value): + """Validates that the barcode start position is greater than 0 + + Parameters + ---------- + given_value : Union[int, str] + the given start position of the barcode to validate + + Returns + ------- + given_value : int + returns the start position if it is valid + + """ + return cls._validate_barcode_input(int(given_value), 0) + + @classmethod + def _validate_barcode_length(cls, given_value): + """Validates that the barcode length is greater than 1 + + Parameters + ---------- + given_value : Union[int, str] + the given length of the barcode to validate + + Returns + ------- + given_value : int + returns the length if it is valid + + """ + return cls._validate_barcode_input(int(given_value), 1) + + @classmethod + def _tag_bamfile(cls, + input_bamfile_name: str, + output_bamfile_name: str, + tag_generators: Iterable[fastq.EmbeddedBarcodeGenerator]) -> None: + """Adds tags from fastq file(s) to a bam file. + + Attaches tags extracted from fastq files by `tag_generators`, attaches them to records from + `input_bamfile_name`, and writes the result to `output_bamfile_name` + + Parameters + ---------- + input_bamfile_name : str + input bam + output_bamfile_name : str + output bam + tag_generators : Iterable[fastq.EmbeddedBarcodeGenerator] + Iterable of generators that yield barcodes from fastq files + + """ + bam_tagger = bam.Tagger(input_bamfile_name) + bam_tagger.tag(output_bamfile_name, tag_generators) + + @classmethod + def _make_tag_generators(cls, r1, i1=None, whitelist=None) -> List[fastq.EmbeddedBarcodeGenerator]: + """Create tag generators from fastq files. + + Tag generators are iterators that run over fastq records, they extract and yield all of the + barcodes embedded in each fastq record. This means extracting the cell, umi, and/or the sample barcode. + + Parameters + ---------- + r1 : str + forward fastq file, where possibly the cell and/or molecule barcode is found + i1 : str, optional + index fastq file, where the sample barcode is found + whitelist : str, optional + A file that contains a list of acceptable cell barcodes + + Returns + ------- + tag_generators : List[EmbeddedBarcodeGenerator] + EmbeddedBarcodeGenerators containing barcodes from the given fastq + + """ + tag_generators = [] + barcode_args = {'fastq_files': r1} + + if i1: + barcode_args['embedded_barcodes'] = [cls.sample_barcode] + tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) + + if whitelist: + barcode_args['whitelist'] = whitelist + if cls.cell_barcode: + barcode_args['embedded_cell_barcode'] = cls.cell_barcode + if cls.molecule_barcode: + barcode_args['other_embedded_barcodes'] = cls.molecule_barcode + tag_generators.append(fastq.BarcodeGeneratorWithCorrectedCellBarcodes(**barcode_args)) + + else: + # for all the barcodes that have a length and starting position specified + barcode_args['embedded_barcodes'] = [barcode for barcode in [cls.cell_barcode, cls.molecule_barcode] if barcode] + tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) + + return tag_generators + + @classmethod + def attach_barcodes(cls, args=None): + """Command line entrypoint for attaching barcodes to a bamfile. + + Parameters + ---------- + args : Iterable[str], optional + arguments list, The default value of None, when passed to `parser.parse_args` + causes the parser to read `sys.argv` + + Returns + ------- + return_call : 0 + return call if the program completes successfully + + """ + parser = argparse.ArgumentParser() + parser.add_argument('--r1', + required=True, + help='read 1 fastq file, where the cell and molecule barcode is found') + parser.add_argument('--u2', + required=True, + help='unaligned bam, can be converted from fastq read 2' + 'using picard FastqToSam') + parser.add_argument('-o', + '--output-bamfile', + required=True, + help='filename for tagged bam') + parser.add_argument('-w', + '--whitelist', + default=None, + help='optional cell barcode whitelist. If provided, corrected barcodes ' + 'will also be output when barcodes are observed within 1ED of a ' + 'whitelisted barcode') + parser.add_argument('--i1', + default=None, + help='(optional) i7 index fastq file, where the sample barcode is found') + parser.add_argument('--sample-barcode-start-position', + dest='sample_barcode_start_pos', + default=None, + help='the user defined start position (base pairs) of the sample barcode', + type=cls._validate_barcode_start_pos) + parser.add_argument('--sample-barcode-length', + dest='sample_barcode_length', + default=None, + help='the user defined length (base pairs) of the sample barcode', + type=cls._validate_barcode_length) + parser.add_argument('--cell-barcode-start-position', + dest='cell_barcode_start_pos', + default=None, + help='the user defined start position, in base pairs, of the cell barcode', + type=cls._validate_barcode_start_pos) + parser.add_argument('--cell-barcode-length', + dest='cell_barcode_length', + default=None, + help='the user defined length, in base pairs, of the cell barcode', + type=cls._validate_barcode_length) + parser.add_argument('--molecule-barcode-start-position', + dest='molecule_barcode_start_pos', + default=None, + help='the user defined start position, in base pairs, of the molecule barcode ' + '(must be not overlap cell barcode if cell barcode is provided)', + type=cls._validate_barcode_start_pos) + parser.add_argument('--molecule-barcode-length', + dest='molecule_barcode_length', + default=None, + help='the user defined length, in base pairs, of the molecule barcode', + type=cls._validate_barcode_length) + + # parse and validate the args + if args: + args = parser.parse_args(args) + else: + args = parser.parse_args() + cls._validate_barcode_args(args) + + # if the length and there for the start pos have been given as args + # get the appropriate barcodes + if args.cell_barcode_length: + cls.cell_barcode = fastq.EmbeddedBarcode(start=args.cell_barcode_start_pos, + end=args.cell_barcode_start_pos + args.cell_barcode_length, + quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY, + sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY) + if args.molecule_barcode_length: + cls.molecule_barcode = fastq.EmbeddedBarcode(start=args.molecule_barcode_start_pos, + end=args.molecule_barcode_start_pos + args.molecule_barcode_length, + quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, + sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY) + if args.sample_barcode_length: + cls.sample_barcode = fastq.EmbeddedBarcode(start=args.sample_barcode_start_pos, + end=args.sample_barcode_start_pos + args.sample_barcode_length, + quality_tag=consts.QUALITY_SAMPLE_BARCODE_TAG_KEY, + sequence_tag=consts.RAW_SAMPLE_BARCODE_TAG_KEY) + + # make the tags and attach the barcodes + tag_generators = cls._make_tag_generators(args.r1, args.i1, args.whitelist) + cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators) + + return 0