Skip to content

Commit a5799e6

Browse files
committed
Clean utility functions and add docs
Remove unnucessary ultility functions, keep the following ones: - filter_chromosomes - make_random_shift - make_flank - chop_genome - dna2OneHot - rev_comp Also sphinx docs was added for them
1 parent f5a558d commit a5799e6

File tree

3 files changed

+102
-181
lines changed

3 files changed

+102
-181
lines changed

docs/source/usage.rst

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ The returned ``wds_file_lists`` contain the output file paths, every file has ~7
4242

4343
One thing worth noting is the ``transforms`` parameter here, ``transforms`` accepts a dictionary of function, each function will be called on the output that its key refers to. In this example, the add 1 lambda function was called on each ``chrom`` tensor, you can do more complicated transformations in this way, e.g., standardize the tensor.
4444

45+
.. autofunction:: seqchromloader.dump_data_webdataset
46+
4547
Loader
4648
------
4749

@@ -72,11 +74,18 @@ A more straightforward way is using ``seqchromloader.SeqChromDatasetByBed``, whi
7274
7375
Here I pass a dictionary describing the keywords arguments would be further passed to ``torch.utils.data.DataLoader`` to increase the number of workers (default is 1), you can refer to `Pytorch DataLoader Document <https://pytorch.org/docs/stable/data.html>`_ to explore more controls on DataLoader behavior
7476

75-
API
76-
---
77+
.. autofunction:: seqchromloader.SeqChromDatasetByBed
78+
79+
.. autofunction:: seqchromloader.SeqChromDatasetByWds
7780

78-
.. autofunction:: seqchromloader.dump_data_webdataset
81+
Utilities
82+
---------
7983

80-
.. autofunction:: seqchromloader.SeqChromDatasetByBed
84+
Utility functions for easily manipulating the genomic coordinates to generate training dataset
8185

82-
.. autofunction:: seqchromloader.SeqChromDatasetByWds
86+
.. autofunction:: seqchromloader.filter_chromosomes
87+
.. autofunction:: seqchromloader.make_random_shift
88+
.. autofunction:: seqchromloader.make_flank
89+
.. autofunction:: seqchromloader.chop_genome
90+
.. autofunction:: seqchromloader.dna2OneHot
91+
.. autofunction:: seqchromloader.rev_comp

seqchromloader/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
from .loader import SeqChromDatasetByDataFrame, SeqChromDatasetByBed, SeqChromDatasetByWds, SeqChromDataModule
2-
from .writer import dump_data_webdataset, convert_data_webdataset
2+
from .writer import dump_data_webdataset, convert_data_webdataset
3+
from .utils import filter_chromosomes, make_random_shift, make_flank, chop_genome, dna2OneHot, rev_comp

seqchromloader/utils.py

Lines changed: 86 additions & 175 deletions
Original file line numberDiff line numberDiff line change
@@ -7,161 +7,45 @@
77
import logging
88
from multiprocessing import Pool
99
from pybedtools import Interval, BedTool
10+
from pybedtools.helpers import chromsizes
1011

11-
12-
def filter_chromosomes(input_df, to_filter=None, to_keep=None):
12+
def filter_chromosomes(coords, to_filter=None, to_keep=None):
1313
"""
14-
This function takes as input a pandas DataFrame
15-
Parameters:
16-
input_df (dataFrame): A pandas dataFrame, the first column is expected to
17-
be a chromosome. Example: chr1.
18-
to_filter (list): Default None (bool = False), will iterate over list
19-
objects and filter the listed chromosomes.
20-
( Default: None, i.e. this condition will not be triggered unless a list
21-
is supplied)
22-
to_keep (list): Default None, will iterate over list objects and only
23-
retain the listed chromosomes.
24-
Returns:
25-
output_df (dataFrame): The filtered pandas dataFrame
14+
Filter or keep the specified chromosomes
15+
16+
:param coords: input coordinate dataframe, first 3 columns are: [chrom, start, end]
17+
:type coords: pandas.DataFrame
18+
:param to_filter: list of chrmosomes to filter, mutually exclusive with `to_keep`
19+
:type to_filter: list
20+
:param to_keep: list of chromosomes to keep, mutually exclusive with `to_filter`
21+
:type to_keep: list
22+
:rtype: filtered coordinate dataframe
2623
"""
24+
25+
if to_filter and to_keep:
26+
print("Both to_filter and to_keep are provided, only to_filter will work under this circumstance!")
27+
2728
if to_filter:
28-
output_df = input_df.copy()
29+
corods_out = coords.copy()
2930
for chromosome in to_filter:
3031
# note: using the str.contains method to remove all
3132
# contigs; for example: chrUn_JH584304
32-
bool_filter = ~(output_df['chrom'].str.contains(chromosome))
33-
output_df = output_df[bool_filter]
33+
bool_filter = ~(corods_out['chrom'].str.contains(chromosome))
34+
corods_out = corods_out[bool_filter]
3435
elif to_keep:
3536
# keep only the to_keep chromosomes:
3637
# note: this is slightly different from to_filter, because
3738
# at a time, if only one chromosome is retained, it can be used
3839
# sequentially.
3940
filtered_chromosomes = []
4041
for chromosome in to_keep:
41-
filtered_record = input_df[(input_df['chrom'] == chromosome)]
42+
filtered_record = coords[(coords['chrom'] == chromosome)]
4243
filtered_chromosomes.append(filtered_record)
4344
# merge the retained chromosomes
44-
output_df = pd.concat(filtered_chromosomes)
45-
else:
46-
output_df = input_df
47-
return output_df
48-
49-
def get_genome_sizes(genome_sizes_file, to_filter=None, to_keep=None):
50-
"""
51-
Loads the genome sizes file which should look like this:
52-
chr1 45900011
53-
chr2 10001401
54-
...
55-
chrX 9981013
56-
This function parses this file, and saves the resulting intervals file
57-
as a BedTools object.
58-
"Random" contigs, chrUns and chrMs are filtered out.
59-
Parameters:
60-
genome_sizes_file (str): (Is in an input to the class,
61-
can be downloaded from UCSC genome browser)
62-
to_filter (list): Default None (bool = False), will iterate over list
63-
objects and filter the listed chromosomes.
64-
( Default: None, i.e. this condition will not be triggered unless a list
65-
is supplied)
66-
to_keep (list): Default None, will iterate over list objects and only
67-
retain the listed chromosomes.
68-
Returns:
69-
A BedTools (from pybedtools) object containing all the chromosomes,
70-
start (0) and stop (chromosome size) positions
71-
"""
72-
genome_sizes = pd.read_csv(genome_sizes_file, sep='\t',
73-
header=None, names=['chrom', 'length'])
74-
75-
genome_sizes_filt = filter_chromosomes(genome_sizes, to_filter=to_filter,
76-
to_keep=to_keep)
77-
78-
genome_bed_data = []
79-
# Note: Modifying this to deal with unexpected (incorrect) edge case \
80-
# BedTools shuffle behavior.
81-
# While shuffling data, BedTools shuffle is placing certain windows at the \
82-
# edge of a chromosome
83-
# Why it's doing that is unclear; will open an issue on GitHub.
84-
# It's probably placing the "start" co-ordinate within limits of the genome,
85-
# with the end coordinate not fitting.
86-
# This leads to the fasta file returning an incomplete sequence \
87-
# (< 500 base pairs)
88-
# This breaks the generator feeding into Model.fit.
89-
# Therefore, in the genome sizes file, buffering 550 from the edges
90-
# to allow for BedTools shuffle to place window without running of the
91-
# chromosome.
92-
for chrom, sizes in genome_sizes_filt.values:
93-
genome_bed_data.append(Interval(chrom, 0 + 550, sizes - 550))
94-
genome_bed_data = BedTool(genome_bed_data)
95-
return genome_bed_data
96-
97-
def load_chipseq_data(chip_peaks_file, genome_sizes_file, to_filter=None,
98-
to_keep=None):
99-
"""
100-
Loads the ChIP-seq peaks data.
101-
The chip peaks file is an events bed file:
102-
chr1:451350
103-
chr2:91024
104-
...
105-
chrX:870000
106-
This file can be constructed using a any peak-caller. We use multiGPS.
107-
Also constructs a 1 bp long bedfile for each coordinate and a
108-
BedTools object which can be later used to generate
109-
negative sets.
110-
"""
111-
chip_seq_data = pd.read_csv(chip_peaks_file, delimiter=':', header=None,
112-
names=['chrom', 'start'])
113-
chip_seq_data['end'] = chip_seq_data['start'] + 1
114-
115-
chip_seq_data = filter_chromosomes(chip_seq_data, to_filter=to_filter,
116-
to_keep=to_keep)
117-
118-
sizes = pd.read_csv(genome_sizes_file, names=['chrom', 'chrsize'],
119-
sep='\t')
120-
121-
# filtering out any regions that are close enough to the edges to
122-
# result in out-of-range windows when applying data augmentation.
123-
chrom_sizes_dict = (dict(zip(sizes.chrom, sizes.chrsize)))
124-
chip_seq_data['window_max'] = chip_seq_data['end'] + 500
125-
chip_seq_data['window_min'] = chip_seq_data['start'] - 500
126-
127-
chip_seq_data['chr_limits_upper'] = chip_seq_data['chrom'].map(
128-
chrom_sizes_dict)
129-
chip_seq_data = chip_seq_data[chip_seq_data['window_max'] <=
130-
chip_seq_data['chr_limits_upper']]
131-
chip_seq_data = chip_seq_data[chip_seq_data['window_min'] >= 0]
132-
chip_seq_data = chip_seq_data[['chrom', 'start', 'end']]
133-
134-
return chip_seq_data
135-
136-
def exclusion_regions(blacklist_file, chip_seq_data):
137-
"""
138-
This function takes as input a bound bed file (from multiGPS).
139-
The assumption is that the bed file reports the peak center
140-
For example: chr2 45 46
141-
It converts these peak centers into 501 base pair windows, and adds them to
142-
the exclusion list which will be used when constructing negative sets.
143-
It also adds the mm10 blacklisted windows to the exclusion list.
144-
Parameters:
145-
blacklist_file (str): Path to the blacklist file.
146-
chip_seq_data (dataFrame): The pandas chip-seq data loaded by load_chipseq_data
147-
Returns:
148-
exclusion_windows (BedTool): A bedtools object containing all exclusion windows.
149-
bound_exclusion_windows (BedTool): A bedtool object containing only
150-
those exclusion windows where there exists a binding site.
151-
"""
152-
temp_chip_file = chip_seq_data.copy() # Doesn't modify OG array.
153-
temp_chip_file['start'] = temp_chip_file['start'] - 250
154-
temp_chip_file['end'] = temp_chip_file['end'] + 250
155-
156-
if blacklist_file is None:
157-
print('No blacklist file specified ...')
158-
exclusion_windows = BedTool.from_dataframe(temp_chip_file[['chrom', 'start','end']])
45+
corods_out = pd.concat(filtered_chromosomes)
15946
else:
160-
bound_exclusion_windows = BedTool.from_dataframe(temp_chip_file[['chrom', 'start','end']])
161-
blacklist_exclusion_windows = BedTool(blacklist_file)
162-
exclusion_windows = BedTool.cat(
163-
*[blacklist_exclusion_windows, bound_exclusion_windows])
164-
return exclusion_windows
47+
corods_out = coords
48+
return corods_out
16549

16650
def make_random_shift(coords, L, buffer=25):
16751
"""
@@ -172,12 +56,16 @@ def make_random_shift(coords, L, buffer=25):
17256
If training window length is L, then we must ensure that the
17357
peak center is still within the training window.
17458
Therefore: -L/2 < shift < L/2
175-
To add in a buffer: -L/2 + 25 <= shift <= L/2 + 25
176-
# Note: The 50 here is a tunable hyper-parameter.
177-
Parameters:
178-
coords(pandas dataFrame): This is an input bedfile (first 3 column names: "chr", "start", "end")
179-
Returns:
180-
shifted_coords(pandas dataFrame): The output bedfile with shifted coords
59+
To add in a buffer: -L/2 + buffer <= shift <= L/2 - 25
60+
61+
:param coords: input coordinate dataframe, first 3 columns are: [chrom, start, end]
62+
:type coords: pandas.DataFrame
63+
:param L: training/shifting window size, the shifted midpoint should fall into the range -L/2 + buffer ~ L/2 + buffer
64+
:type L: integer
65+
:param buffer: buffer size
66+
:type buffer: integer
67+
:rtype: shifted coordinate dataframe
68+
18169
"""
18270
low = int(-L/2 + buffer)
18371
high = int(L/2 - buffer)
@@ -193,29 +81,41 @@ def make_flank(coords, L, d):
19381
Make flanking regions by:
19482
1. Shift midpoint by d
19583
2. Expand midpoint to upstream/downstream by L/2
84+
85+
:param coords: input coordinate dataframe, first 3 columns are: [chrom, start, end]
86+
:type coords: pandas.DataFrame
87+
:param L: window size of output regions
88+
:type L: integer
89+
:param d: shifting distance
90+
:type d: integer
91+
:rtype: shifted coordinate dataframe
19692
"""
19793
return (coords.assign(midpoint=lambda x: (x["start"]+x["end"])/2)
19894
.astype({"midpoint": int})
19995
.assign(midpoint=lambda x: x["midpoint"] + d)
20096
.apply(lambda s: pd.Series([s["chrom"], int(s["midpoint"]-L/2), int(s["midpoint"]+L/2)],
20197
index=["chrom", "start", "end"]), axis=1))
20298

203-
def random_coords(gs, incl, excl, l=500, n=1000):
204-
"""
205-
Randomly sample n intervals of length l from the genome,
206-
shuffle to make all intervals inside the desired regions
207-
and outside exclusion regions
208-
"""
209-
return (BedTool()
210-
.random(l=l, n=n, g=gs)
211-
.shuffle(g=gs, incl=incl.fn, excl=excl.fn)
212-
.to_dataframe()[["chrom", "start", "end"]])
21399

214-
def chop_genome(gs, chroms, excl, stride=500, l=500):
100+
def chop_genome(chroms:list=None, excl:BedTool=None, gs=None, genome=None, stride=500, l=500):
215101
"""
216102
Given a genome size file and chromosome list,
217103
chop these chromosomes into intervals of length l,
218104
with include/exclude regions specified
105+
106+
:param chroms: list of chromosomes to be chopped
107+
:type chroms: list of strings
108+
:param excl: regions that chopped regions shouldn't overlap with
109+
:type excl: BedTool object
110+
:param gs: genome size file path, only one of `gs` and `genome` is required
111+
:type gs: string
112+
:param genome: genome build name, only one of `gs` and `genome` is required
113+
:type genome: string
114+
:param stride: step size that everytime chopped region start coordinate moving forward by
115+
:type stride: int
116+
:param l: interval window size
117+
:type l: int
118+
:rtype: chopped intervals coordinate dataframe
219119
"""
220120
def intervals_loop(chrom, start, stride, l, size):
221121
intervals = []
@@ -226,8 +126,15 @@ def intervals_loop(chrom, start, stride, l, size):
226126
break
227127
start += stride
228128
return pd.DataFrame(intervals, columns=["chrom", "start", "end"])
229-
230-
genome_sizes = (pd.read_csv(gs, sep="\t", names=["chrom", "len"])
129+
130+
if genome:
131+
genome_sizes = (pd.DataFrame(chromsizes(genome))
132+
.T
133+
.rename(columns={0:"chrom", 1:"len"})
134+
.set_index("chrom")
135+
.loc[chroms])
136+
elif gs:
137+
genome_sizes = (pd.read_csv(gs, sep="\t", usecols=[0,1], names=["chrom", "len"])
231138
.set_index("chrom")
232139
.loc[chroms])
233140
genome_chops = pd.concat([intervals_loop(i.Index, 0, stride, l, i.len)
@@ -237,13 +144,6 @@ def intervals_loop(chrom, start, stride, l, size):
237144
return (genome_chops_bdt.intersect(excl, v=True)
238145
.to_dataframe()[["chrom", "start", "end"]])
239146

240-
def clean_bed(coords):
241-
"""
242-
Clean the bed file:
243-
1. Remove intervals with start < 0
244-
"""
245-
return coords.loc[coords["start"]>=0]
246-
247147
class DNA2OneHot(object):
248148
def __init__(self):
249149
self.DNA2Index = {
@@ -268,14 +168,22 @@ def __call__(self, dnaSeq):
268168
continue
269169
return seqMatrix
270170

171+
DNA2Index = {
172+
"A": 0,
173+
"C": 1,
174+
"G": 2,
175+
"T": 3,
176+
}
177+
271178
def dna2OneHot(dnaSeq):
272-
DNA2Index = {
273-
"A": 0,
274-
"C": 1,
275-
"G": 2,
276-
"T": 3,
277-
}
179+
"""
180+
One-hot code input DNA sequence into an array of shape (4, len)
181+
Mapping is in the order of ACGT
278182
183+
:param dnaSeq: input DNA sequence
184+
:type dnaSeq: string
185+
:rtype: numpy array of shape (4, len)
186+
"""
279187
seqLen = len(dnaSeq)
280188
# initialize the matrix as 4 x len(dnaSeq)
281189
seqMatrix = np.zeros((4, len(dnaSeq)), dtype=np.float32)
@@ -291,6 +199,13 @@ def dna2OneHot(dnaSeq):
291199
return seqMatrix
292200

293201
def rev_comp(inp_str):
202+
"""
203+
Return reverse complemented sequence
204+
205+
:param input_str: input DNA sequence
206+
:type input_str: string
207+
:rtype: reverse complemented DNA sequence
208+
"""
294209
rc_dict = {'A': 'T', 'G': 'C', 'T': 'A', 'C': 'G', 'c': 'g',
295210
'g': 'c', 't': 'a', 'a': 't', 'n': 'n', 'N': 'N'}
296211
outp_str = list()
@@ -347,8 +262,4 @@ def extract_info(chrom, start, end, label, genome_pyfasta, bigwigs, target_bam,
347262
for k,t in transforms.items():
348263
feature[k] = t(feature[k])
349264

350-
return feature
351-
352-
if __name__ == "__main__":
353-
chip_seq_coordinates = load_chipseq_data("Bichrom/sample_data/Ascl1.peaks", genome_sizes_file="Bichrom/sample_data/mm10.info",
354-
to_keep=None, to_filter=['chr17', 'chr11', 'chrM', 'chrUn'])
265+
return feature

0 commit comments

Comments
 (0)