From 0cec2c77bbf870bc5ecfd1e881b00ff58fe25823 Mon Sep 17 00:00:00 2001 From: benjamincarlin Date: Tue, 5 Feb 2019 10:32:06 -0500 Subject: [PATCH 01/15] first draft for generalizing attaching barcodes --- setup.py | 1 + src/sctools/platform.py | 186 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 173 insertions(+), 14 deletions(-) diff --git a/setup.py b/setup.py index aa2ed2d..28042fa 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,7 @@ ], entry_points={ 'console_scripts': [ + 'AttachBarcodes = sctools.platform:Attach.attach_barcodes', 'Attach10xBarcodes = sctools.platform:TenXV2.attach_barcodes', 'SplitBam = sctools.platform:GenericPlatform.split_bam', 'CalculateGeneMetrics = sctools.platform:GenericPlatform.calculate_gene_metrics', diff --git a/src/sctools/platform.py b/src/sctools/platform.py index 9589609..c8487a7 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -545,7 +545,7 @@ def _tag_bamfile( @classmethod def _make_tag_generators( - cls, r1, i1=None, whitelist=None) -> List[fastq.EmbeddedBarcodeGenerator]: + cls, r1, i1=None, whitelist=None, no_cell_barcode=False, no_molecule_barcode=False) -> List[fastq.EmbeddedBarcodeGenerator]: """Create tag generators from fastq files. 
Tag generators are iterators that run over fastq records, they extract and yield all of the @@ -567,24 +567,31 @@ def _make_tag_generators( EmbeddedBarcodeGenerators containing barcodes from 10x fastq records """ + tag_generators = [] + barcode_args = {"fastq_files": r1} + # generator for sample barcodes + if i1 is not None: + barcode_args["embedded_barcodes"] = [cls.sample_barcode] + tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) # generator for cell and molecule barcodes if whitelist is not None: - tag_generators.append(fastq.BarcodeGeneratorWithCorrectedCellBarcodes( - fastq_files=r1, - embedded_cell_barcode=cls.cell_barcode, - whitelist=whitelist, - other_embedded_barcodes=[cls.molecule_barcode], - )) + barcode_args["whitelist"] = whitelist + if not no_cell_barcode: + barcode_args["embedded_cell_barcode"] = cls.cell_barcode + if not no_molecule_barcode: + barcode_args["other_embedded_barcodes"] = [cls.molecule_barcode] + tag_generators.append(fastq.BarcodeGeneratorWithCorrectedCellBarcodes(**barcode_args)) else: - tag_generators.append(fastq.EmbeddedBarcodeGenerator( - fastq_files=r1, embedded_barcodes=[cls.cell_barcode, cls.molecule_barcode])) - - # generator for sample barcodes - if i1 is not None: - tag_generators.append(fastq.EmbeddedBarcodeGenerator( - fastq_files=i1, embedded_barcodes=[cls.sample_barcode])) + if not no_cell_barcode: + barcode_args["embedded_barcodes"] = [cls.cell_barcode] + if not no_molecule_barcode: + if not no_cell_barcode: + barcode_args["embedded_barcodes"].append(cls.molecule_barcode) + else: + barcode_args["embedded_barcodes"] = [cls.molecule_barcode] + tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) return tag_generators @classmethod @@ -629,3 +636,154 @@ def attach_barcodes(cls, args=None): cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators) return 0 + + +class Attach(GenericPlatform): + """Command Line Interface for 10x Genomics v2 RNA-sequencing programs + + This class 
defines several methods that are created as CLI tools when sctools is installed + (see setup.py) + + Attributes + ---------- + cell_barcode : fastq.EmbeddedBarcode + A data class that defines the start and end position of the cell barcode and the tags to + assign the sequence and quality of the cell barcode + molecule_barcode : fastq.EmbeddedBarcode + A data class that defines the start and end position of the molecule barcode and the tags + to assign the sequence and quality of the molecule barcode + sample_barcode : fastq.EmbeddedBarcode + A data class that defines the start and end position of the sample barcode and the tags + to assign the sequence and quality of the sample barcode + + Methods + ------- + attach_barcodes() + Attach barcodes from the forward (r1) and optionally index (i1) fastq files to the reverse + (r2) bam file + + """ + + + attach = TenXV2 + @classmethod + def update_barcode_positions(cls, cell_barcode_start_pos, cell_barcode_length, + molecule_barcode_start_pos, molecule_barcode_length, + sample_barcode_start_pos, sample_barcode_length): + cls.attach.cell_barcode = fastq.EmbeddedBarcode( + start=cell_barcode_start_pos, + end=cell_barcode_start_pos + cell_barcode_length, + quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY, + sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY) + cls.attach.molecule_barcode = fastq.EmbeddedBarcode( + start=cls.validate_barcode_input(molecule_barcode_start_pos, cell_barcode_start_pos + cell_barcode_length), + end=molecule_barcode_start_pos + molecule_barcode_length, + quality_tag=consts.QUALITY_SAMPLE_BARCODE_TAG_KEY, + sequence_tag=consts.RAW_SAMPLE_BARCODE_TAG_KEY) + cls.attach.sample_barcode = fastq.EmbeddedBarcode( + start=sample_barcode_start_pos, + end=sample_barcode_start_pos + sample_barcode_length, + quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, + sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY) + + @classmethod + def validate_barcode_input(cls, given_value, min_value): + if given_value < min_value: + 
raise argparse.ArgumentTypeError("The value must be a number >= " + str(min_value)) + return given_value + + @classmethod + def validate_barcode_start_pos(cls, given_value): + return cls.validate_barcode_input(int(given_value), 0) + + @classmethod + def validate_barcode_length(cls, given_value): + return cls.validate_barcode_input(int(given_value), 1) + + @classmethod + def attach_barcodes(cls, args=None): + """Command line entrypoint for attaching barcodes to a bamfile. + + Parameters + ---------- + args : Iterable[str], optional + arguments list, for testing (see test/test_entrypoints.py for example). The default + value of None, when passed to `parser.parse_args` causes the parser to + read `sys.argv` + + Returns + ------- + return_call : 0 + return call if the program completes successfully + + """ + parser = argparse.ArgumentParser() + parser.add_argument( + '--r1', required=True, + help='read 1 fastq file for a 10x genomics v2 experiment') + parser.add_argument( + '--u2', required=True, + help='unaligned bam containing cDNA fragments. Can be converted from fastq read 2' + 'using picard FastqToSam') + parser.add_argument( + '--i1', default=None, + help='(optional) i7 index fastq file for a 10x genomics experiment') + parser.add_argument('-o', '--output-bamfile', required=True, + help='filename for tagged bam') + parser.add_argument('-w', '--whitelist', default=None, + help='optional cell barcode whitelist. 
If provided, corrected barcodes ' + 'will also be output when barcodes are observed within 1ED of a ' + 'whitelisted barcode') + parser.add_argument("--cell-barcode-start-position", + dest="cell_barcode_start_pos", + default=0, + help='the user defined start position, in base pairs, of the cell barcode', + type=cls.validate_barcode_start_pos) + parser.add_argument("--cell-barcode-length", + dest="cell_barcode_length", + default=16, + help='the user defined length, in base pairs, of the cell barcode', + type=cls.validate_barcode_length) + parser.add_argument('--no-cell-barcode', + dest="no_cell_barcode", + help="do no not tag the bam file with a cell barcode", + action='store_true') + parser.add_argument("--molecule-barcode-start-position", + dest="molecule_barcode_start_pos", + default=16, + help='the user defined start position, in base pairs, of the molecule barcode', + type=cls.validate_barcode_start_pos) + parser.add_argument("--molecule-barcode-length", + dest="molecule_barcode_length", + default=10, + help='the user defined length, in base pairs, of the molecule barcode', + type=cls.validate_barcode_length) + parser.add_argument('--no-molecule-barcode', + dest="no_molecule_barcode", + help="do no not tag the bam file with a molecule barcode", + action='store_true') + parser.add_argument("--sample-barcode-start-position", + dest="sample_barcode_start_pos", + default=0, + help='the user defined start position (base pairs) of the sample barcode', + type=cls.validate_barcode_start_pos) + parser.add_argument("--sample-barcode-length", + dest="sample_barcode_length", + default=16, + help='the user defined length (base pairs) of the sample barcode', + type=cls.validate_barcode_length) + + if args is not None: + args = parser.parse_args(args) + tag_generators = cls.attach._make_tag_generators(args.r1, args.i1, args.whitelist) + else: + args = parser.parse_args() + cls.update_barcode_positions(args.cell_barcode_start_pos, args.cell_barcode_length, + 
args.molecule_barcode_start_pos, args.molecule_barcode_length, + args.sample_barcode_start_pos, args.sample_barcode_length) + tag_generators = cls.attach._make_tag_generators(args.r1, args.i1, args.whitelist, args.no_cell_barcode, args.no_molecule_barcode) + cls.attach._tag_bamfile(args.u2, args.output_bamfile, tag_generators) + + return 0 + + From 25b56bdf73cbaf6a48f788072d16730d21755050 Mon Sep 17 00:00:00 2001 From: benjamincarlin Date: Thu, 7 Feb 2019 11:54:31 -0500 Subject: [PATCH 02/15] updated generalized class --- src/sctools/platform.py | 297 +++++++++++++++++++++------------------- 1 file changed, 154 insertions(+), 143 deletions(-) diff --git a/src/sctools/platform.py b/src/sctools/platform.py index c8487a7..ac5b1eb 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -477,10 +477,8 @@ def group_qc_outputs(cls, args: Iterable[str]=None) -> int: class TenXV2(GenericPlatform): """Command Line Interface for 10x Genomics v2 RNA-sequencing programs - This class defines several methods that are created as CLI tools when sctools is installed (see setup.py) - Attributes ---------- cell_barcode : fastq.EmbeddedBarcode @@ -492,13 +490,11 @@ class TenXV2(GenericPlatform): sample_barcode : fastq.EmbeddedBarcode A data class that defines the start and end position of the sample barcode and the tags to assign the sequence and quality of the sample barcode - Methods ------- attach_barcodes() Attach barcodes from the forward (r1) and optionally index (i1) fastq files to the reverse (r2) bam file - """ # 10x contains three barcodes embedded within sequencing reads. The below objects define the # start and end points of those barcodes relative to the start of the sequence, and the @@ -526,10 +522,8 @@ def _tag_bamfile( output_bamfile_name: str, tag_generators: Iterable[fastq.EmbeddedBarcodeGenerator]) -> None: """Adds tags from fastq file(s) to a bam file. 
- Attaches tags extracted from fastq files by `tag_generators`, attaches them to records from `input_bamfile_name`, and writes the result to `output_bamfile_name` - Parameters ---------- input_bamfile_name : str @@ -538,20 +532,17 @@ def _tag_bamfile( output bam tag_generators : Iterable[fastq.EmbeddedBarcodeGenerator] Iterable of generators that yield barcodes from fastq files - """ bam_tagger = bam.Tagger(input_bamfile_name) bam_tagger.tag(output_bamfile_name, tag_generators) @classmethod def _make_tag_generators( - cls, r1, i1=None, whitelist=None, no_cell_barcode=False, no_molecule_barcode=False) -> List[fastq.EmbeddedBarcodeGenerator]: + cls, r1, i1=None, whitelist=None) -> List[fastq.EmbeddedBarcodeGenerator]: """Create tag generators from fastq files. - Tag generators are iterators that run over fastq records, they extract and yield all of the barcodes embedded in each fastq record. For 10x, this means extracting the cell, umi, and optionally, the sample barcode. - Parameters ---------- r1 : str @@ -560,56 +551,44 @@ def _make_tag_generators( index fastq file whitelist : str, optional A file that contains a list of acceptable cell barcodes - Returns ------- tag_generators, List[EmbeddedBarcodeGenerator] EmbeddedBarcodeGenerators containing barcodes from 10x fastq records - """ - tag_generators = [] - barcode_args = {"fastq_files": r1} - # generator for sample barcodes - if i1 is not None: - barcode_args["embedded_barcodes"] = [cls.sample_barcode] - tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) # generator for cell and molecule barcodes if whitelist is not None: - barcode_args["whitelist"] = whitelist - if not no_cell_barcode: - barcode_args["embedded_cell_barcode"] = cls.cell_barcode - if not no_molecule_barcode: - barcode_args["other_embedded_barcodes"] = [cls.molecule_barcode] - tag_generators.append(fastq.BarcodeGeneratorWithCorrectedCellBarcodes(**barcode_args)) + 
tag_generators.append(fastq.BarcodeGeneratorWithCorrectedCellBarcodes( + fastq_files=r1, + embedded_cell_barcode=cls.cell_barcode, + whitelist=whitelist, + other_embedded_barcodes=[cls.molecule_barcode], + )) else: - if not no_cell_barcode: - barcode_args["embedded_barcodes"] = [cls.cell_barcode] - if not no_molecule_barcode: - if not no_cell_barcode: - barcode_args["embedded_barcodes"].append(cls.molecule_barcode) - else: - barcode_args["embedded_barcodes"] = [cls.molecule_barcode] - tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) + tag_generators.append(fastq.EmbeddedBarcodeGenerator( + fastq_files=r1, embedded_barcodes=[cls.cell_barcode, cls.molecule_barcode])) + + # generator for sample barcodes + if i1 is not None: + tag_generators.append(fastq.EmbeddedBarcodeGenerator( + fastq_files=i1, embedded_barcodes=[cls.sample_barcode])) return tag_generators @classmethod def attach_barcodes(cls, args=None): """Command line entrypoint for attaching barcodes to a bamfile. - Parameters ---------- args : Iterable[str], optional arguments list, for testing (see test/test_entrypoints.py for example). 
The default value of None, when passed to `parser.parse_args` causes the parser to read `sys.argv` - Returns ------- return_call : 0 return call if the program completes successfully - """ parser = argparse.ArgumentParser() parser.add_argument( @@ -639,150 +618,182 @@ def attach_barcodes(cls, args=None): class Attach(GenericPlatform): - """Command Line Interface for 10x Genomics v2 RNA-sequencing programs - - This class defines several methods that are created as CLI tools when sctools is installed - (see setup.py) - Attributes - ---------- - cell_barcode : fastq.EmbeddedBarcode - A data class that defines the start and end position of the cell barcode and the tags to - assign the sequence and quality of the cell barcode - molecule_barcode : fastq.EmbeddedBarcode - A data class that defines the start and end position of the molecule barcode and the tags - to assign the sequence and quality of the molecule barcode - sample_barcode : fastq.EmbeddedBarcode - A data class that defines the start and end position of the sample barcode and the tags - to assign the sequence and quality of the sample barcode + cell_barcode = {"start": 0, + "end": 16, + "quality_tag": consts.QUALITY_CELL_BARCODE_TAG_KEY, + "sequence_tag": consts.RAW_CELL_BARCODE_TAG_KEY} + molecule_barcode = {"start": 16, + "end": 26, + "quality_tag": consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, + "sequence_tag": consts.RAW_MOLECULE_BARCODE_TAG_KEY} + sample_barcode = {"start": 0, + "end": 8, + "quality_tag": consts.QUALITY_SAMPLE_BARCODE_TAG_KEY, + "sequence_tag": consts.RAW_SAMPLE_BARCODE_TAG_KEY} - Methods - ------- - attach_barcodes() - Attach barcodes from the forward (r1) and optionally index (i1) fastq files to the reverse - (r2) bam file + @classmethod + def _get_embedded_barcode(cls, barcode): + return fastq.EmbeddedBarcode(**barcode) - """ + @classmethod + def _update_barcode(cls, barcode, barcode_start_pos=None, barcode_length=None): + if barcode_start_pos is not None: + 
cls._validate_barcode_start_pos(barcode_start_pos) + barcode["start"] = int(barcode_start_pos) + if barcode_length is not None: + cls._validate_barcode_length(barcode_length) + barcode["end"] = barcode["start"] + int(barcode_length) - attach = TenXV2 - @classmethod - def update_barcode_positions(cls, cell_barcode_start_pos, cell_barcode_length, - molecule_barcode_start_pos, molecule_barcode_length, - sample_barcode_start_pos, sample_barcode_length): - cls.attach.cell_barcode = fastq.EmbeddedBarcode( - start=cell_barcode_start_pos, - end=cell_barcode_start_pos + cell_barcode_length, - quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY) - cls.attach.molecule_barcode = fastq.EmbeddedBarcode( - start=cls.validate_barcode_input(molecule_barcode_start_pos, cell_barcode_start_pos + cell_barcode_length), - end=molecule_barcode_start_pos + molecule_barcode_length, - quality_tag=consts.QUALITY_SAMPLE_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_SAMPLE_BARCODE_TAG_KEY) - cls.attach.sample_barcode = fastq.EmbeddedBarcode( - start=sample_barcode_start_pos, - end=sample_barcode_start_pos + sample_barcode_length, - quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY) + cls._validate_barcode_input(cls.molecule_barcode["start"], cls.cell_barcode["end"]) @classmethod - def validate_barcode_input(cls, given_value, min_value): + def _validate_barcode_input(cls, given_value, min_value): if given_value < min_value: - raise argparse.ArgumentTypeError("The value must be a number >= " + str(min_value)) + raise argparse.ArgumentTypeError("Invalid barcode lenght/position") return given_value @classmethod - def validate_barcode_start_pos(cls, given_value): - return cls.validate_barcode_input(int(given_value), 0) + def _validate_barcode_start_pos(cls, given_value): + return cls._validate_barcode_input(int(given_value), 0) @classmethod - def validate_barcode_length(cls, given_value): - return 
cls.validate_barcode_input(int(given_value), 1) + def _validate_barcode_length(cls, given_value): + return cls._validate_barcode_input(int(given_value), 1) @classmethod - def attach_barcodes(cls, args=None): - """Command line entrypoint for attaching barcodes to a bamfile. + def _tag_bamfile(cls, + input_bamfile_name: str, + output_bamfile_name: str, + tag_generators: Iterable[fastq.EmbeddedBarcodeGenerator]) -> None: + bam_tagger = bam.Tagger(input_bamfile_name) + bam_tagger.tag(output_bamfile_name, tag_generators) - Parameters - ---------- - args : Iterable[str], optional - arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` + @classmethod + def _make_tag_generators(cls, + r1, + i1=None, + whitelist=None, + no_cell_barcode=None, + no_molecule_barcode=None) -> List[fastq.EmbeddedBarcodeGenerator]: + tag_generators = [] + barcode_args = {"fastq_files": r1} - Returns - ------- - return_call : 0 - return call if the program completes successfully + if i1 is not None: + barcode_args["embedded_barcodes"] = [cls._get_embedded_barcode(cls.sample_barcode)] + tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) + + if whitelist is not None: + barcode_args["whitelist"] = whitelist + if no_cell_barcode is None: + barcode_args["embedded_cell_barcode"] = cls._get_embedded_barcode(cls.cell_barcode) + if no_molecule_barcode is None: + barcode_args["other_embedded_barcodes"] = cls._get_embedded_barcode(cls.molecule_barcode) + + else: + if no_cell_barcode is None and no_molecule_barcode is None: + barcode_args["embedded_barcodes"] = [cls._get_embedded_barcode(cls.cell_barcode), + cls._get_embedded_barcode(cls.molecule_barcode)] + elif no_cell_barcode is None: + barcode_args["embedded_barcodes"] = [cls._get_embedded_barcode(cls.cell_barcode)] + elif no_molecule_barcode is None: + barcode_args["embedded_barcodes"] = 
[cls._get_embedded_barcode(cls.molecule_barcode)] + tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) + + return tag_generators + + @classmethod + def attach_barcodes(cls, args=None): - """ parser = argparse.ArgumentParser() - parser.add_argument( - '--r1', required=True, - help='read 1 fastq file for a 10x genomics v2 experiment') - parser.add_argument( - '--u2', required=True, - help='unaligned bam containing cDNA fragments. Can be converted from fastq read 2' - 'using picard FastqToSam') - parser.add_argument( - '--i1', default=None, - help='(optional) i7 index fastq file for a 10x genomics experiment') - parser.add_argument('-o', '--output-bamfile', required=True, + parser.add_argument('--r1', + required=True, + help='read 1 fastq file for a 10x genomics v2 experiment') + parser.add_argument('--u2', + required=True, + help='unaligned bam containing cDNA fragments. Can be converted from fastq read 2' + 'using picard FastqToSam') + parser.add_argument('-o', + '--output-bamfile', + required=True, help='filename for tagged bam') - parser.add_argument('-w', '--whitelist', default=None, + parser.add_argument('-w', + '--whitelist', + default=None, help='optional cell barcode whitelist. 
If provided, corrected barcodes ' 'will also be output when barcodes are observed within 1ED of a ' 'whitelisted barcode') - parser.add_argument("--cell-barcode-start-position", - dest="cell_barcode_start_pos", - default=0, - help='the user defined start position, in base pairs, of the cell barcode', - type=cls.validate_barcode_start_pos) - parser.add_argument("--cell-barcode-length", - dest="cell_barcode_length", - default=16, - help='the user defined length, in base pairs, of the cell barcode', - type=cls.validate_barcode_length) + + parser.add_argument('--i1', + default=None, + help='(optional) i7 index fastq file for a 10x genomics experiment') + known_args = parser.parse_known_args()[0] + if known_args.i1 is not None: + parser.add_argument("--sample-barcode-start-position", + dest="sample_barcode_start_pos", + default=None, + help='the user defined start position (base pairs) of the sample barcode', + type=cls._validate_barcode_start_pos) + parser.add_argument("--sample-barcode-length", + dest="sample_barcode_length", + default=16, + help='the user defined length (base pairs) of the sample barcode', + type=cls._validate_barcode_length) + parser.add_argument('--no-cell-barcode', dest="no_cell_barcode", + default=None, help="do no not tag the bam file with a cell barcode", action='store_true') - parser.add_argument("--molecule-barcode-start-position", - dest="molecule_barcode_start_pos", - default=16, - help='the user defined start position, in base pairs, of the molecule barcode', - type=cls.validate_barcode_start_pos) - parser.add_argument("--molecule-barcode-length", - dest="molecule_barcode_length", - default=10, - help='the user defined length, in base pairs, of the molecule barcode', - type=cls.validate_barcode_length) + known_args = parser.parse_known_args()[0] + if known_args.no_cell_barcode is None: + parser.add_argument("--cell-barcode-start-position", + dest="cell_barcode_start_pos", + default=None, + help='the user defined start position, in base 
pairs, of the cell barcode', + type=cls._validate_barcode_start_pos) + parser.add_argument("--cell-barcode-length", + dest="cell_barcode_length", + default=None, + help='the user defined length, in base pairs, of the cell barcode', + type=cls._validate_barcode_length) + parser.add_argument('--no-molecule-barcode', dest="no_molecule_barcode", + default=None, help="do no not tag the bam file with a molecule barcode", action='store_true') - parser.add_argument("--sample-barcode-start-position", - dest="sample_barcode_start_pos", - default=0, - help='the user defined start position (base pairs) of the sample barcode', - type=cls.validate_barcode_start_pos) - parser.add_argument("--sample-barcode-length", - dest="sample_barcode_length", - default=16, - help='the user defined length (base pairs) of the sample barcode', - type=cls.validate_barcode_length) + known_args = parser.parse_known_args()[0] + if known_args.no_molecule_barcode is None: + parser.add_argument("--molecule-barcode-start-position", + dest="molecule_barcode_start_pos", + default=16, + help='the user defined start position, in base pairs, of the molecule barcode', + type=cls._validate_barcode_start_pos) + parser.add_argument("--molecule-barcode-length", + dest="molecule_barcode_length", + default=10, + help='the user defined length, in base pairs, of the molecule barcode', + type=cls._validate_barcode_length) if args is not None: args = parser.parse_args(args) - tag_generators = cls.attach._make_tag_generators(args.r1, args.i1, args.whitelist) + tag_generators = cls._make_tag_generators(args.r1, args.i1, args.whitelist) else: args = parser.parse_args() - cls.update_barcode_positions(args.cell_barcode_start_pos, args.cell_barcode_length, - args.molecule_barcode_start_pos, args.molecule_barcode_length, - args.sample_barcode_start_pos, args.sample_barcode_length) - tag_generators = cls.attach._make_tag_generators(args.r1, args.i1, args.whitelist, args.no_cell_barcode, args.no_molecule_barcode) - 
cls.attach._tag_bamfile(args.u2, args.output_bamfile, tag_generators) + if args.i1 is not None: + cls._update_barcode(cls.sample_barcode, args.sample_barcode_start_pos, args.sample_barcode_length) + if args.no_cell_barcode is None: + cls._update_barcode(cls.cell_barcode, args.cell_barcode_start_pos, args.cell_barcode_length) + if args.no_molecule_barcode is None: + cls._update_barcode(cls.molecule_barcode, args.molecule_barcode_start_pos, args.molecule_barcode_length) + tag_generators = cls._make_tag_generators(args.r1, + args.i1, + args.whitelist, + args.no_cell_barcode, + args.no_molecule_barcode) + cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators) return 0 From ad4664e4df4c6dc2f3e2dc4973a01189a01eb6ca Mon Sep 17 00:00:00 2001 From: benjamincarlin Date: Fri, 8 Feb 2019 10:24:23 -0500 Subject: [PATCH 03/15] Fixed PR #1 --- src/sctools/platform.py | 195 ++++++++++++++++++---------------------- 1 file changed, 85 insertions(+), 110 deletions(-) diff --git a/src/sctools/platform.py b/src/sctools/platform.py index ac5b1eb..caf5bac 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -619,39 +619,22 @@ def attach_barcodes(cls, args=None): class Attach(GenericPlatform): - cell_barcode = {"start": 0, - "end": 16, - "quality_tag": consts.QUALITY_CELL_BARCODE_TAG_KEY, - "sequence_tag": consts.RAW_CELL_BARCODE_TAG_KEY} - molecule_barcode = {"start": 16, - "end": 26, - "quality_tag": consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, - "sequence_tag": consts.RAW_MOLECULE_BARCODE_TAG_KEY} - sample_barcode = {"start": 0, - "end": 8, - "quality_tag": consts.QUALITY_SAMPLE_BARCODE_TAG_KEY, - "sequence_tag": consts.RAW_SAMPLE_BARCODE_TAG_KEY} + cell_barcode = None + molecule_barcode = None + sample_barcode = None @classmethod - def _get_embedded_barcode(cls, barcode): + def _get_barcode(cls, barcode_start_pos, barcode_length, barcode_quality_tag, barcode_sequence_tag): + barcode = {"start": barcode_start_pos, + "end": barcode_length, + "quality_tag": 
barcode_quality_tag, + "sequence_tag": barcode_sequence_tag} return fastq.EmbeddedBarcode(**barcode) - @classmethod - def _update_barcode(cls, barcode, barcode_start_pos=None, barcode_length=None): - if barcode_start_pos is not None: - cls._validate_barcode_start_pos(barcode_start_pos) - barcode["start"] = int(barcode_start_pos) - - if barcode_length is not None: - cls._validate_barcode_length(barcode_length) - barcode["end"] = barcode["start"] + int(barcode_length) - - cls._validate_barcode_input(cls.molecule_barcode["start"], cls.cell_barcode["end"]) - @classmethod def _validate_barcode_input(cls, given_value, min_value): if given_value < min_value: - raise argparse.ArgumentTypeError("Invalid barcode lenght/position") + raise argparse.ArgumentTypeError("Invalid barcode length/position") return given_value @classmethod @@ -671,41 +654,35 @@ def _tag_bamfile(cls, bam_tagger.tag(output_bamfile_name, tag_generators) @classmethod - def _make_tag_generators(cls, - r1, - i1=None, - whitelist=None, - no_cell_barcode=None, - no_molecule_barcode=None) -> List[fastq.EmbeddedBarcodeGenerator]: + def _make_tag_generators(cls, r1, i1=None, whitelist=None) -> List[fastq.EmbeddedBarcodeGenerator]: tag_generators = [] barcode_args = {"fastq_files": r1} - if i1 is not None: - barcode_args["embedded_barcodes"] = [cls._get_embedded_barcode(cls.sample_barcode)] + if i1: + barcode_args["embedded_barcodes"] = [cls.sample_barcode] tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) - if whitelist is not None: + if whitelist: barcode_args["whitelist"] = whitelist - if no_cell_barcode is None: - barcode_args["embedded_cell_barcode"] = cls._get_embedded_barcode(cls.cell_barcode) - if no_molecule_barcode is None: - barcode_args["other_embedded_barcodes"] = cls._get_embedded_barcode(cls.molecule_barcode) + if cls.cell_barcode: + barcode_args["embedded_cell_barcode"] = cls.cell_barcode + if cls.molecule_barcode: + barcode_args["other_embedded_barcodes"] = 
cls.molecule_barcode else: - if no_cell_barcode is None and no_molecule_barcode is None: - barcode_args["embedded_barcodes"] = [cls._get_embedded_barcode(cls.cell_barcode), - cls._get_embedded_barcode(cls.molecule_barcode)] - elif no_cell_barcode is None: - barcode_args["embedded_barcodes"] = [cls._get_embedded_barcode(cls.cell_barcode)] - elif no_molecule_barcode is None: - barcode_args["embedded_barcodes"] = [cls._get_embedded_barcode(cls.molecule_barcode)] + if cls.cell_barcode and cls.molecule_barcode: + barcode_args["embedded_barcodes"] = [cls.cell_barcode, + cls.molecule_barcode] + elif cls.cell_barcode: + barcode_args["embedded_barcodes"] = [cls.cell_barcode] + elif cls.molecule_barcode: + barcode_args["embedded_barcodes"] = [cls.molecule_barcode] tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) return tag_generators @classmethod def attach_barcodes(cls, args=None): - parser = argparse.ArgumentParser() parser.add_argument('--r1', required=True, @@ -724,77 +701,75 @@ def attach_barcodes(cls, args=None): help='optional cell barcode whitelist. 
If provided, corrected barcodes ' 'will also be output when barcodes are observed within 1ED of a ' 'whitelisted barcode') - parser.add_argument('--i1', default=None, help='(optional) i7 index fastq file for a 10x genomics experiment') - known_args = parser.parse_known_args()[0] - if known_args.i1 is not None: - parser.add_argument("--sample-barcode-start-position", - dest="sample_barcode_start_pos", - default=None, - help='the user defined start position (base pairs) of the sample barcode', - type=cls._validate_barcode_start_pos) - parser.add_argument("--sample-barcode-length", - dest="sample_barcode_length", - default=16, - help='the user defined length (base pairs) of the sample barcode', - type=cls._validate_barcode_length) - - parser.add_argument('--no-cell-barcode', - dest="no_cell_barcode", + parser.add_argument("--sample-barcode-start-position", + dest="sample_barcode_start_pos", default=None, - help="do no not tag the bam file with a cell barcode", - action='store_true') - known_args = parser.parse_known_args()[0] - if known_args.no_cell_barcode is None: - parser.add_argument("--cell-barcode-start-position", - dest="cell_barcode_start_pos", - default=None, - help='the user defined start position, in base pairs, of the cell barcode', - type=cls._validate_barcode_start_pos) - parser.add_argument("--cell-barcode-length", - dest="cell_barcode_length", - default=None, - help='the user defined length, in base pairs, of the cell barcode', - type=cls._validate_barcode_length) - - parser.add_argument('--no-molecule-barcode', - dest="no_molecule_barcode", + help='the user defined start position (base pairs) of the sample barcode', + type=cls._validate_barcode_start_pos) + parser.add_argument("--sample-barcode-length", + dest="sample_barcode_length", default=None, - help="do no not tag the bam file with a molecule barcode", - action='store_true') - known_args = parser.parse_known_args()[0] - if known_args.no_molecule_barcode is None: - 
parser.add_argument("--molecule-barcode-start-position", - dest="molecule_barcode_start_pos", - default=16, - help='the user defined start position, in base pairs, of the molecule barcode', - type=cls._validate_barcode_start_pos) - parser.add_argument("--molecule-barcode-length", - dest="molecule_barcode_length", - default=10, - help='the user defined length, in base pairs, of the molecule barcode', - type=cls._validate_barcode_length) - - if args is not None: + help='the user defined length (base pairs) of the sample barcode', + type=cls._validate_barcode_length) + parser.add_argument("--cell-barcode-start-position", + dest="cell_barcode_start_pos", + default=None, + help='the user defined start position, in base pairs, of the cell barcode', + type=cls._validate_barcode_start_pos) + parser.add_argument("--cell-barcode-length", + dest="cell_barcode_length", + default=None, + help='the user defined length, in base pairs, of the cell barcode', + type=cls._validate_barcode_length) + parser.add_argument("--molecule-barcode-start-position", + dest="molecule_barcode_start_pos", + default=None, + help='the user defined start position, in base pairs, of the molecule barcode ' + '(must be not overlap cell barcode if cell barcode is provided)', + type=cls._validate_barcode_start_pos) + parser.add_argument("--molecule-barcode-length", + dest="molecule_barcode_length", + default=None, + help='the user defined length, in base pairs, of the molecule barcode', + type=cls._validate_barcode_length) + if args: args = parser.parse_args(args) - tag_generators = cls._make_tag_generators(args.r1, args.i1, args.whitelist) else: args = parser.parse_args() - if args.i1 is not None: - cls._update_barcode(cls.sample_barcode, args.sample_barcode_start_pos, args.sample_barcode_length) - if args.no_cell_barcode is None: - cls._update_barcode(cls.cell_barcode, args.cell_barcode_start_pos, args.cell_barcode_length) - if args.no_molecule_barcode is None: - cls._update_barcode(cls.molecule_barcode, 
args.molecule_barcode_start_pos, args.molecule_barcode_length) - tag_generators = cls._make_tag_generators(args.r1, - args.i1, - args.whitelist, - args.no_cell_barcode, - args.no_molecule_barcode) - cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators) - return 0 + if ((bool(args.cell_barcode_start_pos) or args.cell_barcode_start_pos == 0) + != bool(args.cell_barcode_length) or + (bool(args.molecule_barcode_start_pos) or args.molecule_barcode_start_pos == 0) + != bool(args.molecule_barcode_length) or + (bool(args.sample_barcode_start_pos) or args.sample_barcode_start_pos == 0) + != bool(args.sample_barcode_length)): + argparse.ArgumentError("Invalid barocde pos/length arguments, barcode start pos and barcode length must be specified together") + if args.i1 is None and args.sample_barcode_length: + argparse.ArgumentError("An i7 index fastq file must be given to attach a sample barcode") + if args.cell_barcode_length and args.molecule_barcode_length: + cls._validate_barcode_input(args.molecule_barcode_start_pos, + args.cell_barcode_start_pos + args.cell_barcode_length) + + if args.cell_barcode_length: + cls.cell_barcode = cls._get_barcode(args.cell_barcode_start_pos, + args.cell_barcode_length, + consts.QUALITY_CELL_BARCODE_TAG_KEY, + consts.RAW_CELL_BARCODE_TAG_KEY) + if args.molecule_barcode_length: + cls.molecule_barcode = cls._get_barcode(args.molecule_barcode_start_pos, + args.molecule_barcode_length, + consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, + consts.RAW_MOLECULE_BARCODE_TAG_KEY) + if args.sample_barcode_length: + cls.sample_barcode = cls._get_barcode(args.sample_barcode_start_pos, + args.sample_barcode_length, + consts.QUALITY_SAMPLE_BARCODE_TAG_KEY, + consts.RAW_SAMPLE_BARCODE_TAG_KEY) + tag_generators = cls._make_tag_generators(args.r1, args.i1, args.whitelist) + cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators) + return 0 From 1651144778ac20da529e2b8fac850e914d7b433a Mon Sep 17 00:00:00 2001 From: benjamincarlin Date: Fri, 8 Feb 2019 
10:26:48 -0500 Subject: [PATCH 04/15] styling comment changes --- src/sctools/platform.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/sctools/platform.py b/src/sctools/platform.py index caf5bac..4f7d415 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -477,8 +477,10 @@ def group_qc_outputs(cls, args: Iterable[str]=None) -> int: class TenXV2(GenericPlatform): """Command Line Interface for 10x Genomics v2 RNA-sequencing programs + This class defines several methods that are created as CLI tools when sctools is installed (see setup.py) + Attributes ---------- cell_barcode : fastq.EmbeddedBarcode @@ -490,11 +492,13 @@ class TenXV2(GenericPlatform): sample_barcode : fastq.EmbeddedBarcode A data class that defines the start and end position of the sample barcode and the tags to assign the sequence and quality of the sample barcode + Methods ------- attach_barcodes() Attach barcodes from the forward (r1) and optionally index (i1) fastq files to the reverse (r2) bam file + """ # 10x contains three barcodes embedded within sequencing reads. The below objects define the # start and end points of those barcodes relative to the start of the sequence, and the @@ -522,8 +526,10 @@ def _tag_bamfile( output_bamfile_name: str, tag_generators: Iterable[fastq.EmbeddedBarcodeGenerator]) -> None: """Adds tags from fastq file(s) to a bam file. 
+ Attaches tags extracted from fastq files by `tag_generators`, attaches them to records from `input_bamfile_name`, and writes the result to `output_bamfile_name` + Parameters ---------- input_bamfile_name : str @@ -532,6 +538,7 @@ def _tag_bamfile( output bam tag_generators : Iterable[fastq.EmbeddedBarcodeGenerator] Iterable of generators that yield barcodes from fastq files + """ bam_tagger = bam.Tagger(input_bamfile_name) bam_tagger.tag(output_bamfile_name, tag_generators) @@ -540,9 +547,11 @@ def _tag_bamfile( def _make_tag_generators( cls, r1, i1=None, whitelist=None) -> List[fastq.EmbeddedBarcodeGenerator]: """Create tag generators from fastq files. + Tag generators are iterators that run over fastq records, they extract and yield all of the barcodes embedded in each fastq record. For 10x, this means extracting the cell, umi, and optionally, the sample barcode. + Parameters ---------- r1 : str @@ -551,10 +560,12 @@ def _make_tag_generators( index fastq file whitelist : str, optional A file that contains a list of acceptable cell barcodes + Returns ------- tag_generators, List[EmbeddedBarcodeGenerator] EmbeddedBarcodeGenerators containing barcodes from 10x fastq records + """ tag_generators = [] @@ -579,16 +590,19 @@ def _make_tag_generators( @classmethod def attach_barcodes(cls, args=None): """Command line entrypoint for attaching barcodes to a bamfile. + Parameters ---------- args : Iterable[str], optional arguments list, for testing (see test/test_entrypoints.py for example). 
The default value of None, when passed to `parser.parse_args` causes the parser to + read `sys.argv` Returns ------- return_call : 0 return call if the program completes successfully + """ parser = argparse.ArgumentParser() parser.add_argument( From 0b32ac21143159c0d775d43c0a946106afe6d4d1 Mon Sep 17 00:00:00 2001 From: benjamincarlin Date: Fri, 8 Feb 2019 10:28:09 -0500 Subject: [PATCH 05/15] styling comment changes --- src/sctools/platform.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sctools/platform.py b/src/sctools/platform.py index 4f7d415..d80c980 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -596,13 +596,13 @@ def attach_barcodes(cls, args=None): args : Iterable[str], optional arguments list, for testing (see test/test_entrypoints.py for example). The default value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` + Returns ------- return_call : 0 return call if the program completes successfully - + """ parser = argparse.ArgumentParser() parser.add_argument( From 43f6ff352deee455db0868baa9a2c79699481f75 Mon Sep 17 00:00:00 2001 From: benjamincarlin Date: Fri, 8 Feb 2019 10:32:18 -0500 Subject: [PATCH 06/15] styling comment changes --- src/sctools/platform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sctools/platform.py b/src/sctools/platform.py index d80c980..78706e4 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -597,7 +597,7 @@ def attach_barcodes(cls, args=None): arguments list, for testing (see test/test_entrypoints.py for example). 
The default value of None, when passed to `parser.parse_args` causes the parser to read `sys.argv` - + Returns ------- return_call : 0 From 652c843757cd394cf36942b96b02e76e50259ce8 Mon Sep 17 00:00:00 2001 From: benjamincarlin Date: Mon, 11 Feb 2019 14:46:12 -0500 Subject: [PATCH 07/15] barcode end position bug --- src/sctools/platform.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/sctools/platform.py b/src/sctools/platform.py index 78706e4..2460040 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -691,6 +691,7 @@ def _make_tag_generators(cls, r1, i1=None, whitelist=None) -> List[fastq.Embedde barcode_args["embedded_barcodes"] = [cls.cell_barcode] elif cls.molecule_barcode: barcode_args["embedded_barcodes"] = [cls.molecule_barcode] + print(barcode_args) tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) return tag_generators @@ -769,19 +770,19 @@ def attach_barcodes(cls, args=None): if args.cell_barcode_length: cls.cell_barcode = cls._get_barcode(args.cell_barcode_start_pos, - args.cell_barcode_length, + args.cell_barcode_start_pos + args.cell_barcode_length, consts.QUALITY_CELL_BARCODE_TAG_KEY, consts.RAW_CELL_BARCODE_TAG_KEY) if args.molecule_barcode_length: cls.molecule_barcode = cls._get_barcode(args.molecule_barcode_start_pos, - args.molecule_barcode_length, - consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, - consts.RAW_MOLECULE_BARCODE_TAG_KEY) + args.molecule_barcode_start_pos + args.molecule_barcode_length, + consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, + consts.RAW_MOLECULE_BARCODE_TAG_KEY) if args.sample_barcode_length: cls.sample_barcode = cls._get_barcode(args.sample_barcode_start_pos, - args.sample_barcode_length, - consts.QUALITY_SAMPLE_BARCODE_TAG_KEY, - consts.RAW_SAMPLE_BARCODE_TAG_KEY) + args.sample_barcode_start_pos + args.sample_barcode_length, + consts.QUALITY_SAMPLE_BARCODE_TAG_KEY, + consts.RAW_SAMPLE_BARCODE_TAG_KEY) tag_generators = cls._make_tag_generators(args.r1,
args.i1, args.whitelist) cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators) From 2accfbcbf0d556b8dd39f90abe81c377fae6b889 Mon Sep 17 00:00:00 2001 From: benjamincarlin Date: Wed, 13 Feb 2019 14:00:13 -0500 Subject: [PATCH 08/15] Fixed PR #2 --- setup.py | 2 +- src/sctools/platform.py | 73 ++++++++++++++++++----------------------- 2 files changed, 33 insertions(+), 42 deletions(-) diff --git a/setup.py b/setup.py index 28042fa..555367f 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ ], entry_points={ 'console_scripts': [ - 'AttachBarcodes = sctools.platform:Attach.attach_barcodes', + 'AttachBarcodes = sctools.platform:BarcodePlatform.attach_barcodes', 'Attach10xBarcodes = sctools.platform:TenXV2.attach_barcodes', 'SplitBam = sctools.platform:GenericPlatform.split_bam', 'CalculateGeneMetrics = sctools.platform:GenericPlatform.calculate_gene_metrics', diff --git a/src/sctools/platform.py b/src/sctools/platform.py index 2460040..2cd72d9 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -631,26 +631,31 @@ def attach_barcodes(cls, args=None): return 0 -class Attach(GenericPlatform): +class BarcodePlatform(GenericPlatform): cell_barcode = None molecule_barcode = None sample_barcode = None - @classmethod - def _get_barcode(cls, barcode_start_pos, barcode_length, barcode_quality_tag, barcode_sequence_tag): - barcode = {"start": barcode_start_pos, - "end": barcode_length, - "quality_tag": barcode_quality_tag, - "sequence_tag": barcode_sequence_tag} - return fastq.EmbeddedBarcode(**barcode) - @classmethod def _validate_barcode_input(cls, given_value, min_value): if given_value < min_value: raise argparse.ArgumentTypeError("Invalid barcode length/position") return given_value + @classmethod + def _validate_barcode_args(cls, args): + # check that both the barcode length and position are given as arguments + if ((bool(args.cell_barcode_start_pos) or args.cell_barcode_start_pos == 0) != bool(args.cell_barcode_length) or + 
(bool(args.molecule_barcode_start_pos) or args.molecule_barcode_start_pos == 0) != bool(args.molecule_barcode_length) or + (bool(args.sample_barcode_start_pos) or args.sample_barcode_start_pos == 0) != bool(args.sample_barcode_length)): + argparse.ArgumentError("Invalid barocde pos/length arguments, barcode start pos and barcode length must be specified together") + + if args.i1 is None and args.sample_barcode_length: + argparse.ArgumentError("An i7 index fastq file must be given to attach a sample barcode") + + return args + @classmethod def _validate_barcode_start_pos(cls, given_value): return cls._validate_barcode_input(int(given_value), 0) @@ -682,16 +687,10 @@ def _make_tag_generators(cls, r1, i1=None, whitelist=None) -> List[fastq.Embedde barcode_args["embedded_cell_barcode"] = cls.cell_barcode if cls.molecule_barcode: barcode_args["other_embedded_barcodes"] = cls.molecule_barcode + tag_generators.append(fastq.BarcodeGeneratorWithCorrectedCellBarcodes(**barcode_args)) else: - if cls.cell_barcode and cls.molecule_barcode: - barcode_args["embedded_barcodes"] = [cls.cell_barcode, - cls.molecule_barcode] - elif cls.cell_barcode: - barcode_args["embedded_barcodes"] = [cls.cell_barcode] - elif cls.molecule_barcode: - barcode_args["embedded_barcodes"] = [cls.molecule_barcode] - print(barcode_args) + barcode_args["embedded_barcodes"] = [barcode for barcode in [cls.cell_barcode, cls.molecule_barcode] if barcode] tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) return tag_generators @@ -701,10 +700,10 @@ def attach_barcodes(cls, args=None): parser = argparse.ArgumentParser() parser.add_argument('--r1', required=True, - help='read 1 fastq file for a 10x genomics v2 experiment') + help='read 1 fastq file') parser.add_argument('--u2', required=True, - help='unaligned bam containing cDNA fragments. 
Can be converted from fastq read 2' + help='unaligned bam, can be converted from fastq read 2' 'using picard FastqToSam') parser.add_argument('-o', '--output-bamfile', @@ -718,7 +717,7 @@ def attach_barcodes(cls, args=None): 'whitelisted barcode') parser.add_argument('--i1', default=None, - help='(optional) i7 index fastq file for a 10x genomics experiment') + help='(optional) i7 index fastq file') parser.add_argument("--sample-barcode-start-position", dest="sample_barcode_start_pos", default=None, @@ -754,35 +753,27 @@ def attach_barcodes(cls, args=None): args = parser.parse_args(args) else: args = parser.parse_args() + cls._validate_barcode_args(args) - if ((bool(args.cell_barcode_start_pos) or args.cell_barcode_start_pos == 0) - != bool(args.cell_barcode_length) or - (bool(args.molecule_barcode_start_pos) or args.molecule_barcode_start_pos == 0) - != bool(args.molecule_barcode_length) or - (bool(args.sample_barcode_start_pos) or args.sample_barcode_start_pos == 0) - != bool(args.sample_barcode_length)): - argparse.ArgumentError("Invalid barocde pos/length arguments, barcode start pos and barcode length must be specified together") - if args.i1 is None and args.sample_barcode_length: - argparse.ArgumentError("An i7 index fastq file must be given to attach a sample barcode") if args.cell_barcode_length and args.molecule_barcode_length: cls._validate_barcode_input(args.molecule_barcode_start_pos, args.cell_barcode_start_pos + args.cell_barcode_length) if args.cell_barcode_length: - cls.cell_barcode = cls._get_barcode(args.cell_barcode_start_pos, - args.cell_barcode_start_pos + args.cell_barcode_length, - consts.QUALITY_CELL_BARCODE_TAG_KEY, - consts.RAW_CELL_BARCODE_TAG_KEY) + cls.cell_barcode = fastq.EmbeddedBarcode(start=args.cell_barcode_start_pos, + end=args.cell_barcode_start_pos + args.cell_barcode_length, + quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY, + sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY) if args.molecule_barcode_length: - cls.molecule_barcode 
= cls._get_barcode(args.molecule_barcode_start_pos, - args.molecule_barcode_start_pos + args.molecule_barcode_length, - consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, - consts.RAW_MOLECULE_BARCODE_TAG_KEY) + cls.molecule_barcode = fastq.EmbeddedBarcode(start=args.molecule_barcode_start_pos, + end=args.molecule_barcode_start_pos + args.molecule_barcode_length, + quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, + sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY) if args.sample_barcode_length: - cls.sample_barcode = cls._get_barcode(args.sample_barcode_start_pos, - args.sample_barcode_start_pos + args.sample_barcode_length, - consts.QUALITY_SAMPLE_BARCODE_TAG_KEY, - consts.RAW_SAMPLE_BARCODE_TAG_KEY) + cls.sample_barcode = fastq.EmbeddedBarcode(start=args.sample_barcode_start_pos, + end=args.sample_barcode_start_pos + args.sample_barcode_length, + quality_tag=consts.QUALITY_SAMPLE_BARCODE_TAG_KEY, + sequence_tag=consts.RAW_SAMPLE_BARCODE_TAG_KEY) tag_generators = cls._make_tag_generators(args.r1, args.i1, args.whitelist) cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators) From 29956d2c5b77b61de305523e39cc298ed9cc2a00 Mon Sep 17 00:00:00 2001 From: benjamincarlin Date: Wed, 13 Feb 2019 14:18:18 -0500 Subject: [PATCH 09/15] Fixed PR #2, made arg validation more human readable --- src/sctools/platform.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/src/sctools/platform.py b/src/sctools/platform.py index 2cd72d9..4caf60f 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -645,15 +645,30 @@ def _validate_barcode_input(cls, given_value, min_value): @classmethod def _validate_barcode_args(cls, args): - # check that both the barcode length and position are given as arguments - if ((bool(args.cell_barcode_start_pos) or args.cell_barcode_start_pos == 0) != bool(args.cell_barcode_length) or - (bool(args.molecule_barcode_start_pos) or args.molecule_barcode_start_pos == 0) != 
bool(args.molecule_barcode_length) or - (bool(args.sample_barcode_start_pos) or args.sample_barcode_start_pos == 0) != bool(args.sample_barcode_length)): + cell_barcode_start_pos_exists = bool(args.cell_barcode_start_pos) or (args.cell_barcode_start_pos == 0) + cell_barcode_length_exists = bool(args.cell_barcode_length) + + molecule_barcode_start_pos_exists = bool(args.molecule_barcode_start_pos) or (args.molecule_barcode_start_pos == 0) + molecule_barcode_length_exists = bool(args.molecule_barcode_length) + + sample_barcode_start_pos_exists = bool(args.sample_barcode_start_pos) or (args.sample_barcode_start_pos == 0) + sample_barcode_length_exists = bool(args.sample_barcode_length) + + # check that both the barcode length and position are given as arguments (XOR boolean logic) + if (cell_barcode_start_pos_exists != cell_barcode_length_exists or + molecule_barcode_start_pos_exists != molecule_barcode_length_exists or + sample_barcode_start_pos_exists != sample_barcode_length_exists): argparse.ArgumentError("Invalid barocde pos/length arguments, barcode start pos and barcode length must be specified together") + # check that an index fastq is provided sample barcode length and position are given if args.i1 is None and args.sample_barcode_length: argparse.ArgumentError("An i7 index fastq file must be given to attach a sample barcode") + # check that cell and molecule barcodes don't overlap + if args.cell_barcode_length and args.molecule_barcode_length: + cls._validate_barcode_input(args.molecule_barcode_start_pos, + args.cell_barcode_start_pos + args.cell_barcode_length) + return args @classmethod @@ -755,10 +770,6 @@ def attach_barcodes(cls, args=None): args = parser.parse_args() cls._validate_barcode_args(args) - if args.cell_barcode_length and args.molecule_barcode_length: - cls._validate_barcode_input(args.molecule_barcode_start_pos, - args.cell_barcode_start_pos + args.cell_barcode_length) - if args.cell_barcode_length: cls.cell_barcode = 
fastq.EmbeddedBarcode(start=args.cell_barcode_start_pos, end=args.cell_barcode_start_pos + args.cell_barcode_length, From 1393a1493cdd32d1504ea1fa8804ea7aa6e044e3 Mon Sep 17 00:00:00 2001 From: benjamincarlin Date: Thu, 14 Feb 2019 10:44:08 -0500 Subject: [PATCH 10/15] add comments/documentation to the class --- src/sctools/platform.py | 139 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 138 insertions(+), 1 deletion(-) diff --git a/src/sctools/platform.py b/src/sctools/platform.py index 4caf60f..d98d396 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -632,19 +632,73 @@ def attach_barcodes(cls, args=None): class BarcodePlatform(GenericPlatform): + """Command Line Interface for 10x Genomics v2 RNA-sequencing programs + + This class defines several methods that are created as CLI tools when sctools is installed + (see setup.py) + Attributes + ---------- + cell_barcode : fastq.EmbeddedBarcode + A data class that defines the start and end position of the cell barcode and the tags to + assign the sequence and quality of the cell barcode + molecule_barcode : fastq.EmbeddedBarcode + A data class that defines the start and end position of the molecule barcode and the tags + to assign the sequence and quality of the molecule barcode + sample_barcode : fastq.EmbeddedBarcode + A data class that defines the start and end position of the sample barcode and the tags + to assign the sequence and quality of the sample barcode + + Methods + ------- + attach_barcodes() + Attach barcodes from the forward (r1) and optionally index (i1) fastq files to the reverse + (r2) bam file + + """ cell_barcode = None molecule_barcode = None sample_barcode = None @classmethod def _validate_barcode_input(cls, given_value, min_value): + """Validates that the barcode input is greater than a min value + + Parameters + ---------- + given_value : int + the given value that must be greater than the min_value, + (barcode length or barcode starting position) + + min_value : 
int + the min value that the given_value must be greater than + + Returns + ------- + given_value : int + return given value if valid + + """ if given_value < min_value: raise argparse.ArgumentTypeError("Invalid barcode length/position") return given_value @classmethod def _validate_barcode_args(cls, args): + """Validates that the barcode start position is greater than 0 + + Parameters + ---------- + args : Iterable[str] + arguments list, The default value of None, when passed to `parser.parse_args` + causes the parser to read `sys.argv` + + Returns + ------- + args : Iterable[str], optional + return arguments list if valid + + """ cell_barcode_start_pos_exists = bool(args.cell_barcode_start_pos) or (args.cell_barcode_start_pos == 0) cell_barcode_length_exists = bool(args.cell_barcode_length) @@ -654,7 +708,8 @@ def _validate_barcode_args(cls, args): sample_barcode_start_pos_exists = bool(args.sample_barcode_start_pos) or (args.sample_barcode_start_pos == 0) sample_barcode_length_exists = bool(args.sample_barcode_length) - # check that both the barcode length and position are given as arguments (XOR boolean logic) + # check that both the barcode length and position are given as arguments + # or that neither barcode length and position are given as arguments (XOR boolean logic) if (cell_barcode_start_pos_exists != cell_barcode_length_exists or molecule_barcode_start_pos_exists != molecule_barcode_length_exists or sample_barcode_start_pos_exists != sample_barcode_length_exists): @@ -673,10 +728,36 @@ def _validate_barcode_args(cls, args): @classmethod def _validate_barcode_start_pos(cls, given_value): + """Validates that the barcode start position is greater than 0 + + Parameters + ---------- + given_value : int + the given start position of the barcode to validate + + Returns + ------- + given_value, int + returns the start position if it is valid + + """ return cls._validate_barcode_input(int(given_value), 0) @classmethod def _validate_barcode_length(cls, 
given_value): + """Validates that the barcode length is greater than 1 + + Parameters + ---------- + given_value : int + the given length of the barcode to validate + + Returns + ------- + given_value, int + returns the length if it is valid + + """ return cls._validate_barcode_input(int(given_value), 1) @classmethod @@ -684,11 +765,47 @@ def _tag_bamfile(cls, input_bamfile_name: str, output_bamfile_name: str, tag_generators: Iterable[fastq.EmbeddedBarcodeGenerator]) -> None: + """Adds tags from fastq file(s) to a bam file. + + Attaches tags extracted from fastq files by `tag_generators`, attaches them to records from + `input_bamfile_name`, and writes the result to `output_bamfile_name` + + Parameters + ---------- + input_bamfile_name : str + input bam + output_bamfile_name : str + output bam + tag_generators : Iterable[fastq.EmbeddedBarcodeGenerator] + Iterable of generators that yield barcodes from fastq files + + """ bam_tagger = bam.Tagger(input_bamfile_name) bam_tagger.tag(output_bamfile_name, tag_generators) @classmethod def _make_tag_generators(cls, r1, i1=None, whitelist=None) -> List[fastq.EmbeddedBarcodeGenerator]: + """Create tag generators from fastq files. + + Tag generators are iterators that run over fastq records, they extract and yield all of the + barcodes embedded in each fastq record. For 10x, this means extracting the cell, umi, and + optionally, the sample barcode. 
+ + Parameters + ---------- + r1 : str + forward fastq file + i1 : str, optional + index fastq file + whitelist : str, optional + A file that contains a list of acceptable cell barcodes + + Returns + ------- + tag_generators, List[EmbeddedBarcodeGenerator] + EmbeddedBarcodeGenerators containing barcodes from 10x fastq records + + """ tag_generators = [] barcode_args = {"fastq_files": r1} @@ -705,6 +822,7 @@ def _make_tag_generators(cls, r1, i1=None, whitelist=None) -> List[fastq.Embedde tag_generators.append(fastq.BarcodeGeneratorWithCorrectedCellBarcodes(**barcode_args)) else: + # for all the barcodes that have a length and starting position specified barcode_args["embedded_barcodes"] = [barcode for barcode in [cls.cell_barcode, cls.molecule_barcode] if barcode] tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) @@ -712,6 +830,20 @@ def _make_tag_generators(cls, r1, i1=None, whitelist=None) -> List[fastq.Embedde @classmethod def attach_barcodes(cls, args=None): + """Command line entrypoint for attaching barcodes to a bamfile. 
+ + Parameters + ---------- + args : Iterable[str], optional + arguments list, The default value of None, when passed to `parser.parse_args` + causes the parser to read `sys.argv` + + Returns + ------- + return_call : 0 + return call if the program completes successfully + + """ parser = argparse.ArgumentParser() parser.add_argument('--r1', required=True, @@ -764,12 +896,16 @@ def attach_barcodes(cls, args=None): default=None, help='the user defined length, in base pairs, of the molecule barcode', type=cls._validate_barcode_length) + + # parse and validate the args if args: args = parser.parse_args(args) else: args = parser.parse_args() cls._validate_barcode_args(args) + # if the length and there for the start pos have been given as args + # get the appropriate barcodes if args.cell_barcode_length: cls.cell_barcode = fastq.EmbeddedBarcode(start=args.cell_barcode_start_pos, end=args.cell_barcode_start_pos + args.cell_barcode_length, @@ -786,6 +922,7 @@ def attach_barcodes(cls, args=None): quality_tag=consts.QUALITY_SAMPLE_BARCODE_TAG_KEY, sequence_tag=consts.RAW_SAMPLE_BARCODE_TAG_KEY) + # make the tags and attach the barcodes tag_generators = cls._make_tag_generators(args.r1, args.i1, args.whitelist) cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators) From 9f03d3a8b06fc00dd3c22789c059f89f569f18b5 Mon Sep 17 00:00:00 2001 From: benjamincarlin Date: Thu, 14 Feb 2019 11:05:22 -0500 Subject: [PATCH 11/15] updated comments/documentation style for return values --- src/sctools/platform.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sctools/platform.py b/src/sctools/platform.py index d98d396..12711f8 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -737,7 +737,7 @@ def _validate_barcode_start_pos(cls, given_value): Returns ------- - given_value, int + given_value : int returns the start position if it is valid """ @@ -754,7 +754,7 @@ def _validate_barcode_length(cls, given_value): Returns ------- - 
given_value, int + given_value : int returns the length if it is valid """ @@ -802,7 +802,7 @@ def _make_tag_generators(cls, r1, i1=None, whitelist=None) -> List[fastq.Embedde Returns ------- - tag_generators, List[EmbeddedBarcodeGenerator] + tag_generators : List[EmbeddedBarcodeGenerator] EmbeddedBarcodeGenerators containing barcodes from 10x fastq records """ From 364209243722135fc30693df7969e6e941fcbffa Mon Sep 17 00:00:00 2001 From: benjamincarlin Date: Wed, 20 Feb 2019 11:10:29 -0500 Subject: [PATCH 12/15] Fixed PR #57 comments --- src/sctools/platform.py | 57 ++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/src/sctools/platform.py b/src/sctools/platform.py index 12711f8..c99781c 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -680,7 +680,7 @@ def _validate_barcode_input(cls, given_value, min_value): """ if given_value < min_value: - raise argparse.ArgumentTypeError("Invalid barcode length/position") + raise argparse.ArgumentTypeError('Invalid barcode length/position') return given_value @classmethod @@ -695,29 +695,32 @@ def _validate_barcode_args(cls, args): Returns ------- - args : Iterable[str], optional + args : Iterable[str] return arguments list if valid """ + # check that both the barcode length and position are given as arguments + # or that neither barcode length and position are given as arguments (XOR boolean logic) cell_barcode_start_pos_exists = bool(args.cell_barcode_start_pos) or (args.cell_barcode_start_pos == 0) cell_barcode_length_exists = bool(args.cell_barcode_length) + if (cell_barcode_start_pos_exists != cell_barcode_length_exists): + argparse.ArgumentError('Invalid cell barocde position/length, both position and length must be provided by the user together') molecule_barcode_start_pos_exists = bool(args.molecule_barcode_start_pos) or (args.molecule_barcode_start_pos == 0) molecule_barcode_length_exists = bool(args.molecule_barcode_length) + if 
(molecule_barcode_start_pos_exists != molecule_barcode_length_exists): + argparse.ArgumentError('Invalid molecule barocde position/length, both position and length must be provided by the user together') + sample_barcode_start_pos_exists = bool(args.sample_barcode_start_pos) or (args.sample_barcode_start_pos == 0) sample_barcode_length_exists = bool(args.sample_barcode_length) + if (sample_barcode_start_pos_exists != sample_barcode_length_exists): + argparse.ArgumentError('Invalid sample barocde position/length, both position and length must be provided by the user together') - # check that both the barcode length and position are given as arguments - # or that neither barcode length and position are given as arguments (XOR boolean logic) - if (cell_barcode_start_pos_exists != cell_barcode_length_exists or - molecule_barcode_start_pos_exists != molecule_barcode_length_exists or - sample_barcode_start_pos_exists != sample_barcode_length_exists): - argparse.ArgumentError("Invalid barocde pos/length arguments, barcode start pos and barcode length must be specified together") # check that an index fastq is provided sample barcode length and position are given if args.i1 is None and args.sample_barcode_length: - argparse.ArgumentError("An i7 index fastq file must be given to attach a sample barcode") + argparse.ArgumentError('An i7 index fastq file must be given to attach a sample barcode') # check that cell and molecule barcodes don't overlap if args.cell_barcode_length and args.molecule_barcode_length: @@ -807,23 +810,23 @@ def _make_tag_generators(cls, r1, i1=None, whitelist=None) -> List[fastq.Embedde """ tag_generators = [] - barcode_args = {"fastq_files": r1} + barcode_args = {'fastq_files': r1} if i1: - barcode_args["embedded_barcodes"] = [cls.sample_barcode] + barcode_args['embedded_barcodes'] = [cls.sample_barcode] tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) if whitelist: - barcode_args["whitelist"] = whitelist + 
barcode_args['whitelist'] = whitelist if cls.cell_barcode: - barcode_args["embedded_cell_barcode"] = cls.cell_barcode + barcode_args['embedded_cell_barcode'] = cls.cell_barcode if cls.molecule_barcode: - barcode_args["other_embedded_barcodes"] = cls.molecule_barcode + barcode_args['other_embedded_barcodes'] = cls.molecule_barcode tag_generators.append(fastq.BarcodeGeneratorWithCorrectedCellBarcodes(**barcode_args)) else: # for all the barcodes that have a length and starting position specified - barcode_args["embedded_barcodes"] = [barcode for barcode in [cls.cell_barcode, cls.molecule_barcode] if barcode] + barcode_args['embedded_barcodes'] = [barcode for barcode in [cls.cell_barcode, cls.molecule_barcode] if barcode] tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) return tag_generators @@ -865,34 +868,34 @@ def attach_barcodes(cls, args=None): parser.add_argument('--i1', default=None, help='(optional) i7 index fastq file') - parser.add_argument("--sample-barcode-start-position", - dest="sample_barcode_start_pos", + parser.add_argument('--sample-barcode-start-position', + dest='sample_barcode_start_pos', default=None, help='the user defined start position (base pairs) of the sample barcode', type=cls._validate_barcode_start_pos) - parser.add_argument("--sample-barcode-length", - dest="sample_barcode_length", + parser.add_argument('--sample-barcode-length', + dest='sample_barcode_length', default=None, help='the user defined length (base pairs) of the sample barcode', type=cls._validate_barcode_length) - parser.add_argument("--cell-barcode-start-position", - dest="cell_barcode_start_pos", + parser.add_argument('--cell-barcode-start-position', + dest='cell_barcode_start_pos', default=None, help='the user defined start position, in base pairs, of the cell barcode', type=cls._validate_barcode_start_pos) - parser.add_argument("--cell-barcode-length", - dest="cell_barcode_length", + parser.add_argument('--cell-barcode-length', + 
dest='cell_barcode_length', default=None, help='the user defined length, in base pairs, of the cell barcode', type=cls._validate_barcode_length) - parser.add_argument("--molecule-barcode-start-position", - dest="molecule_barcode_start_pos", + parser.add_argument('--molecule-barcode-start-position', + dest='molecule_barcode_start_pos', default=None, help='the user defined start position, in base pairs, of the molecule barcode ' '(must be not overlap cell barcode if cell barcode is provided)', type=cls._validate_barcode_start_pos) - parser.add_argument("--molecule-barcode-length", - dest="molecule_barcode_length", + parser.add_argument('--molecule-barcode-length', + dest='molecule_barcode_length', default=None, help='the user defined length, in base pairs, of the molecule barcode', type=cls._validate_barcode_length) From 3b302d8b3793c45cb6efa0762792374659bd8008 Mon Sep 17 00:00:00 2001 From: benjamincarlin Date: Mon, 25 Feb 2019 12:09:46 -0500 Subject: [PATCH 13/15] Updated doc strings, error handling --- src/sctools/platform.py | 109 ++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 50 deletions(-) diff --git a/src/sctools/platform.py b/src/sctools/platform.py index c99781c..d7b36ed 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -632,7 +632,7 @@ def attach_barcodes(cls, args=None): class BarcodePlatform(GenericPlatform): - """Command Line Interface for 10x Genomics v2 RNA-sequencing programs + """Command Line Interface for extracting and attaching barcodes with specified positions This class defines several methods that are created as CLI tools when sctools is installed (see setup.py) @@ -660,67 +660,30 @@ class BarcodePlatform(GenericPlatform): molecule_barcode = None sample_barcode = None - @classmethod - def _validate_barcode_input(cls, given_value, min_value): - """Validates that the barcode input is greater than a min value - - Parameters - ---------- - given_value : int - the given value that must be greater than 
the min_value, - (barcode length or barcode starting position) - - min_value : int - the min value that the given_value must be greater than - - Returns - ------- - given_value : int - return given value if valid - - """ - if given_value < min_value: - raise argparse.ArgumentTypeError('Invalid barcode length/position') - return given_value - @classmethod def _validate_barcode_args(cls, args): """Validates that the barcode start position is greater than 0 Parameters ---------- - args : Iterable[str] + args : object arguments list, The default value of None, when passed to `parser.parse_args` causes the parser to read `sys.argv` Returns ------- - args : Iterable[str] + args : object return arguments list if valid """ - # check that both the barcode length and position are given as arguments - # or that neither barcode length and position are given as arguments (XOR boolean logic) - cell_barcode_start_pos_exists = bool(args.cell_barcode_start_pos) or (args.cell_barcode_start_pos == 0) - cell_barcode_length_exists = bool(args.cell_barcode_length) - if (cell_barcode_start_pos_exists != cell_barcode_length_exists): - argparse.ArgumentError('Invalid cell barocde position/length, both position and length must be provided by the user together') - - molecule_barcode_start_pos_exists = bool(args.molecule_barcode_start_pos) or (args.molecule_barcode_start_pos == 0) - molecule_barcode_length_exists = bool(args.molecule_barcode_length) - if (molecule_barcode_start_pos_exists != molecule_barcode_length_exists): - argparse.ArgumentError('Invalid molecule barocde position/length, both position and length must be provided by the user together') - - - sample_barcode_start_pos_exists = bool(args.sample_barcode_start_pos) or (args.sample_barcode_start_pos == 0) - sample_barcode_length_exists = bool(args.sample_barcode_length) - if (sample_barcode_start_pos_exists != sample_barcode_length_exists): - argparse.ArgumentError('Invalid sample barocde position/length, both position and length 
must be provided by the user together') - + # check that if a barcode start position is provided, its length is also (and vice versa) + cls._validate_barcode_length_and_position(args.cell_barcode_start_pos, args.cell_barcode_length) + cls._validate_barcode_length_and_position(args.molecule_barcode_start_pos, args.molecule_barcode_length) + cls._validate_barcode_length_and_position(args.sample_barcode_start_pos, args.sample_barcode_length) # check that an index fastq is provided sample barcode length and position are given if args.i1 is None and args.sample_barcode_length: - argparse.ArgumentError('An i7 index fastq file must be given to attach a sample barcode') + raise argparse.ArgumentError('An i7 index fastq file must be given to attach a sample barcode') # check that cell and molecule barcodes don't overlap if args.cell_barcode_length and args.molecule_barcode_length: @@ -729,13 +692,60 @@ def _validate_barcode_args(cls, args): return args + @classmethod + def _validate_barcode_length_and_position(cls, barcode_start_position, barcode_length): + """Checks that either that both barcode length and position are given or that neither are given as arguments + + Parameters + ---------- + barcode_start_position : int + the user defined start position (base pairs) of the barcode + + barcode_length : int + the user defined length (base pairs) of the barcode + + Returns + ------- + given_value : int + return given value if valid + + """ + barcode_start_pos_exists = bool(barcode_start_position) or (barcode_start_position == 0) + barcode_length_exists = bool(barcode_length) + # (XOR boolean logic) + if (barcode_start_pos_exists != barcode_length_exists): + raise argparse.ArgumentError('Invalid position/length, both position and length must be provided by the user together') + + @classmethod + def _validate_barcode_input(cls, given_value, min_value): + """Validates that the barcode input is greater than a min value + + Parameters + ---------- + given_value : int + the given 
value that must be greater than the min_value, + (barcode length or barcode starting position) + + min_value : int + the min value that the given_value must be greater than + + Returns + ------- + given_value : int + return given value if valid + + """ + if given_value < min_value: + raise argparse.ArgumentTypeError('Invalid barcode length/position') + return given_value + @classmethod def _validate_barcode_start_pos(cls, given_value): """Validates that the barcode start position is greater than 0 Parameters ---------- - given_value : int + given_value : Union[int, str] the given start position of the barcode to validate Returns @@ -752,7 +762,7 @@ def _validate_barcode_length(cls, given_value): Parameters ---------- - given_value : int + given_value : Union[int, str] the given length of the barcode to validate Returns @@ -791,8 +801,7 @@ def _make_tag_generators(cls, r1, i1=None, whitelist=None) -> List[fastq.Embedde """Create tag generators from fastq files. Tag generators are iterators that run over fastq records, they extract and yield all of the - barcodes embedded in each fastq record. For 10x, this means extracting the cell, umi, and - optionally, the sample barcode. + barcodes embedded in each fastq record. This means extracting the cell, umi, and/or the sample barcode. 
Parameters ---------- @@ -806,7 +815,7 @@ def _make_tag_generators(cls, r1, i1=None, whitelist=None) -> List[fastq.Embedde Returns ------- tag_generators : List[EmbeddedBarcodeGenerator] - EmbeddedBarcodeGenerators containing barcodes from 10x fastq records + EmbeddedBarcodeGenerators containing barcodes from the given fastq """ tag_generators = [] From 2f61e02add84dbb996714791b3b15774e7ca6dec Mon Sep 17 00:00:00 2001 From: benjamincarlin Date: Thu, 7 Mar 2019 13:08:01 -0500 Subject: [PATCH 14/15] provided descriptions for input files and purpose of class --- src/sctools/platform.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/sctools/platform.py b/src/sctools/platform.py index d7b36ed..b1b946a 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -633,6 +633,11 @@ def attach_barcodes(cls, args=None): class BarcodePlatform(GenericPlatform): """Command Line Interface for extracting and attaching barcodes with specified positions + generalizing TenXV2 attach barcodes + + Sample, cell and/or molecule barcodes can be extracted and attached to an unmapped bam when the + corresponding barcode's start position and and length are provided. 
The sample barcode is extracted + from the index i7 fastq file and the cell/molecule barcode is extracted from the r1 fastq file This class defines several methods that are created as CLI tools when sctools is installed (see setup.py) @@ -806,9 +811,9 @@ def _make_tag_generators(cls, r1, i1=None, whitelist=None) -> List[fastq.Embedde Parameters ---------- r1 : str - forward fastq file + forward fastq file, where possibly the cell and/or molecule barcode is found i1 : str, optional - index fastq file + index fastq file, where the sample barcode is found whitelist : str, optional A file that contains a list of acceptable cell barcodes @@ -859,7 +864,7 @@ def attach_barcodes(cls, args=None): parser = argparse.ArgumentParser() parser.add_argument('--r1', required=True, - help='read 1 fastq file') + help='read 1 fastq file, where the cell and molecule barcode is found') parser.add_argument('--u2', required=True, help='unaligned bam, can be converted from fastq read 2' @@ -876,7 +881,7 @@ def attach_barcodes(cls, args=None): 'whitelisted barcode') parser.add_argument('--i1', default=None, - help='(optional) i7 index fastq file') + help='(optional) i7 index fastq file, where the sample barcode is found') parser.add_argument('--sample-barcode-start-position', dest='sample_barcode_start_pos', default=None, From 4be3e1b7b69fa5ac7c8accfc22fd233b941b4ab8 Mon Sep 17 00:00:00 2001 From: benjamincarlin Date: Mon, 11 Mar 2019 09:45:03 -0400 Subject: [PATCH 15/15] updated class description --- src/sctools/platform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sctools/platform.py b/src/sctools/platform.py index b1b946a..eeb8a25 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -637,7 +637,7 @@ class BarcodePlatform(GenericPlatform): Sample, cell and/or molecule barcodes can be extracted and attached to an unmapped bam when the corresponding barcode's start position and and length are provided. 
The sample barcode is extracted - from the index i7 fastq file and the cell/molecule barcode is extracted from the r1 fastq file + from the index i7 fastq file and the cell and molecule barcode are extracted from the r1 fastq file This class defines several methods that are created as CLI tools when sctools is installed (see setup.py)