Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
],
entry_points={
'console_scripts': [
'AttachBarcodes = sctools.platform:BarcodePlatform.attach_barcodes',
'Attach10xBarcodes = sctools.platform:TenXV2.attach_barcodes',
'SplitBam = sctools.platform:GenericPlatform.split_bam',
'CalculateGeneMetrics = sctools.platform:GenericPlatform.calculate_gene_metrics',
Expand Down
315 changes: 315 additions & 0 deletions src/sctools/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,3 +629,318 @@ def attach_barcodes(cls, args=None):
cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators)

return 0


class BarcodePlatform(GenericPlatform):
"""Command Line Interface for extracting and attaching barcodes with specified positions
generalizing TenXV2 attach barcodes

Sample, cell and/or molecule barcodes can be extracted and attached to an unmapped bam when the
corresponding barcode's start position and and length are provided. The sample barcode is extracted
from the index i7 fastq file and the cell and molecule barcode are extracted from the r1 fastq file

This class defines several methods that are created as CLI tools when sctools is installed
(see setup.py)

Attributes
----------
cell_barcode : fastq.EmbeddedBarcode
A data class that defines the start and end position of the cell barcode and the tags to
assign the sequence and quality of the cell barcode
molecule_barcode : fastq.EmbeddedBarcode
A data class that defines the start and end position of the molecule barcode and the tags
to assign the sequence and quality of the molecule barcode
sample_barcode : fastq.EmbeddedBarcode
A data class that defines the start and end position of the sample barcode and the tags
to assign the sequence and quality of the sample barcode

Methods
-------
attach_barcodes()
Attach barcodes from the forward (r1) and optionally index (i1) fastq files to the reverse
(r2) bam file

"""
cell_barcode = None
molecule_barcode = None
sample_barcode = None

@classmethod
def _validate_barcode_args(cls, args):
"""Validates that the barcode start position is greater than 0

Parameters
----------
args : object
arguments list, The default value of None, when passed to `parser.parse_args`
causes the parser to read `sys.argv`

Returns
-------
args : object
return arguments list if valid

"""
# check that if a barcode start position is provided, its length is also (and vice versa)
cls._validate_barcode_length_and_position(args.cell_barcode_start_pos, args.cell_barcode_length)
cls._validate_barcode_length_and_position(args.molecule_barcode_start_pos, args.molecule_barcode_length)
cls._validate_barcode_length_and_position(args.sample_barcode_start_pos, args.sample_barcode_length)

# check that an index fastq is provided sample barcode length and position are given
if args.i1 is None and args.sample_barcode_length:
raise argparse.ArgumentError('An i7 index fastq file must be given to attach a sample barcode')

# check that cell and molecule barcodes don't overlap
if args.cell_barcode_length and args.molecule_barcode_length:
cls._validate_barcode_input(args.molecule_barcode_start_pos,
args.cell_barcode_start_pos + args.cell_barcode_length)

return args

@classmethod
def _validate_barcode_length_and_position(cls, barcode_start_position, barcode_length):
"""Checks that either that both barcode length and position are given or that neither are given as arguments

Parameters
----------
barcode_start_position : int
the user defined start position (base pairs) of the barcode

barcode_length : int
the user defined length (base pairs) of the barcode

Returns
-------
given_value : int
return given value if valid

"""
barcode_start_pos_exists = bool(barcode_start_position) or (barcode_start_position == 0)
barcode_length_exists = bool(barcode_length)
# (XOR boolean logic)
if (barcode_start_pos_exists != barcode_length_exists):
raise argparse.ArgumentError('Invalid position/length, both position and length must be provided by the user together')

@classmethod
def _validate_barcode_input(cls, given_value, min_value):
"""Validates that the barcode input is greater than a min value

Parameters
----------
given_value : int
the given value that must be greater than the min_value,
(barcode length or barcode starting position)

min_value : int
the min value that the given_value must be greater than

Returns
-------
given_value : int
return given value if valid

"""
if given_value < min_value:
raise argparse.ArgumentTypeError('Invalid barcode length/position')
return given_value

@classmethod
def _validate_barcode_start_pos(cls, given_value):
"""Validates that the barcode start position is greater than 0

Parameters
----------
given_value : Union[int, str]
the given start position of the barcode to validate

Returns
-------
given_value : int
returns the start position if it is valid

"""
return cls._validate_barcode_input(int(given_value), 0)

@classmethod
def _validate_barcode_length(cls, given_value):
"""Validates that the barcode length is greater than 1

Parameters
----------
given_value : Union[int, str]
the given length of the barcode to validate

Returns
-------
given_value : int
returns the length if it is valid

"""
return cls._validate_barcode_input(int(given_value), 1)

@classmethod
def _tag_bamfile(cls,
input_bamfile_name: str,
output_bamfile_name: str,
tag_generators: Iterable[fastq.EmbeddedBarcodeGenerator]) -> None:
"""Adds tags from fastq file(s) to a bam file.

Attaches tags extracted from fastq files by `tag_generators`, attaches them to records from
`input_bamfile_name`, and writes the result to `output_bamfile_name`

Parameters
----------
input_bamfile_name : str
input bam
output_bamfile_name : str
output bam
tag_generators : Iterable[fastq.EmbeddedBarcodeGenerator]
Iterable of generators that yield barcodes from fastq files

"""
bam_tagger = bam.Tagger(input_bamfile_name)
bam_tagger.tag(output_bamfile_name, tag_generators)

@classmethod
def _make_tag_generators(cls, r1, i1=None, whitelist=None) -> List[fastq.EmbeddedBarcodeGenerator]:
"""Create tag generators from fastq files.

Tag generators are iterators that run over fastq records, they extract and yield all of the
barcodes embedded in each fastq record. This means extracting the cell, umi, and/or the sample barcode.

Parameters
----------
r1 : str
forward fastq file, where possibly the cell and/or molecule barcode is found
i1 : str, optional
index fastq file, where the sample barcode is found
whitelist : str, optional
A file that contains a list of acceptable cell barcodes

Returns
-------
tag_generators : List[EmbeddedBarcodeGenerator]
EmbeddedBarcodeGenerators containing barcodes from the given fastq

"""
tag_generators = []
barcode_args = {'fastq_files': r1}

if i1:
barcode_args['embedded_barcodes'] = [cls.sample_barcode]
tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args))

if whitelist:
barcode_args['whitelist'] = whitelist
if cls.cell_barcode:
barcode_args['embedded_cell_barcode'] = cls.cell_barcode
if cls.molecule_barcode:
barcode_args['other_embedded_barcodes'] = cls.molecule_barcode
tag_generators.append(fastq.BarcodeGeneratorWithCorrectedCellBarcodes(**barcode_args))

else:
# for all the barcodes that have a length and starting position specified
barcode_args['embedded_barcodes'] = [barcode for barcode in [cls.cell_barcode, cls.molecule_barcode] if barcode]
tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args))

return tag_generators

@classmethod
def attach_barcodes(cls, args=None):
"""Command line entrypoint for attaching barcodes to a bamfile.

Parameters
----------
args : Iterable[str], optional
arguments list, The default value of None, when passed to `parser.parse_args`
causes the parser to read `sys.argv`

Returns
-------
return_call : 0
return call if the program completes successfully

"""
parser = argparse.ArgumentParser()
parser.add_argument('--r1',
required=True,
help='read 1 fastq file, where the cell and molecule barcode is found')
parser.add_argument('--u2',
required=True,
help='unaligned bam, can be converted from fastq read 2'
'using picard FastqToSam')
parser.add_argument('-o',
'--output-bamfile',
required=True,
help='filename for tagged bam')
parser.add_argument('-w',
'--whitelist',
default=None,
help='optional cell barcode whitelist. If provided, corrected barcodes '
'will also be output when barcodes are observed within 1ED of a '
'whitelisted barcode')
parser.add_argument('--i1',
default=None,
help='(optional) i7 index fastq file, where the sample barcode is found')
parser.add_argument('--sample-barcode-start-position',
dest='sample_barcode_start_pos',
default=None,
help='the user defined start position (base pairs) of the sample barcode',
type=cls._validate_barcode_start_pos)
parser.add_argument('--sample-barcode-length',
dest='sample_barcode_length',
default=None,
help='the user defined length (base pairs) of the sample barcode',
type=cls._validate_barcode_length)
parser.add_argument('--cell-barcode-start-position',
dest='cell_barcode_start_pos',
default=None,
help='the user defined start position, in base pairs, of the cell barcode',
type=cls._validate_barcode_start_pos)
parser.add_argument('--cell-barcode-length',
dest='cell_barcode_length',
default=None,
help='the user defined length, in base pairs, of the cell barcode',
type=cls._validate_barcode_length)
parser.add_argument('--molecule-barcode-start-position',
dest='molecule_barcode_start_pos',
default=None,
help='the user defined start position, in base pairs, of the molecule barcode '
'(must be not overlap cell barcode if cell barcode is provided)',
type=cls._validate_barcode_start_pos)
parser.add_argument('--molecule-barcode-length',
dest='molecule_barcode_length',
default=None,
help='the user defined length, in base pairs, of the molecule barcode',
type=cls._validate_barcode_length)

# parse and validate the args
if args:
args = parser.parse_args(args)
else:
args = parser.parse_args()
cls._validate_barcode_args(args)

# if the length and there for the start pos have been given as args
# get the appropriate barcodes
if args.cell_barcode_length:
cls.cell_barcode = fastq.EmbeddedBarcode(start=args.cell_barcode_start_pos,
end=args.cell_barcode_start_pos + args.cell_barcode_length,
quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY,
sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY)
if args.molecule_barcode_length:
cls.molecule_barcode = fastq.EmbeddedBarcode(start=args.molecule_barcode_start_pos,
end=args.molecule_barcode_start_pos + args.molecule_barcode_length,
quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY,
sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY)
if args.sample_barcode_length:
cls.sample_barcode = fastq.EmbeddedBarcode(start=args.sample_barcode_start_pos,
end=args.sample_barcode_start_pos + args.sample_barcode_length,
quality_tag=consts.QUALITY_SAMPLE_BARCODE_TAG_KEY,
sequence_tag=consts.RAW_SAMPLE_BARCODE_TAG_KEY)

# make the tags and attach the barcodes
tag_generators = cls._make_tag_generators(args.r1, args.i1, args.whitelist)
cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators)

return 0