77import logging
88from multiprocessing import Pool
99from pybedtools import Interval , BedTool
10+ from pybedtools .helpers import chromsizes
1011
11-
12- def filter_chromosomes (input_df , to_filter = None , to_keep = None ):
12+ def filter_chromosomes (coords , to_filter = None , to_keep = None ):
1313 """
14- This function takes as input a pandas DataFrame
15- Parameters:
16- input_df (dataFrame): A pandas dataFrame, the first column is expected to
17- be a chromosome. Example: chr1.
18- to_filter (list): Default None (bool = False), will iterate over list
19- objects and filter the listed chromosomes.
20- ( Default: None, i.e. this condition will not be triggered unless a list
21- is supplied)
22- to_keep (list): Default None, will iterate over list objects and only
23- retain the listed chromosomes.
24- Returns:
25- output_df (dataFrame): The filtered pandas dataFrame
14+ Filter or keep the specified chromosomes
15+
16+ :param coords: input coordinate dataframe, first 3 columns are: [chrom, start, end]
17+ :type coords: pandas.DataFrame
18+ :param to_filter: list of chrmosomes to filter, mutually exclusive with `to_keep`
19+ :type to_filter: list
20+ :param to_keep: list of chromosomes to keep, mutually exclusive with `to_filter`
21+ :type to_keep: list
22+ :rtype: filtered coordinate dataframe
2623 """
24+
25+ if to_filter and to_keep :
26+ print ("Both to_filter and to_keep are provided, only to_filter will work under this circumstance!" )
27+
2728 if to_filter :
28- output_df = input_df .copy ()
29+ corods_out = coords .copy ()
2930 for chromosome in to_filter :
3031 # note: using the str.contains method to remove all
3132 # contigs; for example: chrUn_JH584304
32- bool_filter = ~ (output_df ['chrom' ].str .contains (chromosome ))
33- output_df = output_df [bool_filter ]
33+ bool_filter = ~ (corods_out ['chrom' ].str .contains (chromosome ))
34+ corods_out = corods_out [bool_filter ]
3435 elif to_keep :
3536 # keep only the to_keep chromosomes:
3637 # note: this is slightly different from to_filter, because
3738 # at a time, if only one chromosome is retained, it can be used
3839 # sequentially.
3940 filtered_chromosomes = []
4041 for chromosome in to_keep :
41- filtered_record = input_df [( input_df ['chrom' ] == chromosome )]
42+ filtered_record = coords [( coords ['chrom' ] == chromosome )]
4243 filtered_chromosomes .append (filtered_record )
4344 # merge the retained chromosomes
44- output_df = pd .concat (filtered_chromosomes )
45- else :
46- output_df = input_df
47- return output_df
48-
49- def get_genome_sizes (genome_sizes_file , to_filter = None , to_keep = None ):
50- """
51- Loads the genome sizes file which should look like this:
52- chr1 45900011
53- chr2 10001401
54- ...
55- chrX 9981013
56- This function parses this file, and saves the resulting intervals file
57- as a BedTools object.
58- "Random" contigs, chrUns and chrMs are filtered out.
59- Parameters:
60- genome_sizes_file (str): (Is in an input to the class,
61- can be downloaded from UCSC genome browser)
62- to_filter (list): Default None (bool = False), will iterate over list
63- objects and filter the listed chromosomes.
64- ( Default: None, i.e. this condition will not be triggered unless a list
65- is supplied)
66- to_keep (list): Default None, will iterate over list objects and only
67- retain the listed chromosomes.
68- Returns:
69- A BedTools (from pybedtools) object containing all the chromosomes,
70- start (0) and stop (chromosome size) positions
71- """
72- genome_sizes = pd .read_csv (genome_sizes_file , sep = '\t ' ,
73- header = None , names = ['chrom' , 'length' ])
74-
75- genome_sizes_filt = filter_chromosomes (genome_sizes , to_filter = to_filter ,
76- to_keep = to_keep )
77-
78- genome_bed_data = []
79- # Note: Modifying this to deal with unexpected (incorrect) edge case \
80- # BedTools shuffle behavior.
81- # While shuffling data, BedTools shuffle is placing certain windows at the \
82- # edge of a chromosome
83- # Why it's doing that is unclear; will open an issue on GitHub.
84- # It's probably placing the "start" co-ordinate within limits of the genome,
85- # with the end coordinate not fitting.
86- # This leads to the fasta file returning an incomplete sequence \
87- # (< 500 base pairs)
88- # This breaks the generator feeding into Model.fit.
89- # Therefore, in the genome sizes file, buffering 550 from the edges
90- # to allow for BedTools shuffle to place window without running of the
91- # chromosome.
92- for chrom , sizes in genome_sizes_filt .values :
93- genome_bed_data .append (Interval (chrom , 0 + 550 , sizes - 550 ))
94- genome_bed_data = BedTool (genome_bed_data )
95- return genome_bed_data
96-
97- def load_chipseq_data (chip_peaks_file , genome_sizes_file , to_filter = None ,
98- to_keep = None ):
99- """
100- Loads the ChIP-seq peaks data.
101- The chip peaks file is an events bed file:
102- chr1:451350
103- chr2:91024
104- ...
105- chrX:870000
106- This file can be constructed using a any peak-caller. We use multiGPS.
107- Also constructs a 1 bp long bedfile for each coordinate and a
108- BedTools object which can be later used to generate
109- negative sets.
110- """
111- chip_seq_data = pd .read_csv (chip_peaks_file , delimiter = ':' , header = None ,
112- names = ['chrom' , 'start' ])
113- chip_seq_data ['end' ] = chip_seq_data ['start' ] + 1
114-
115- chip_seq_data = filter_chromosomes (chip_seq_data , to_filter = to_filter ,
116- to_keep = to_keep )
117-
118- sizes = pd .read_csv (genome_sizes_file , names = ['chrom' , 'chrsize' ],
119- sep = '\t ' )
120-
121- # filtering out any regions that are close enough to the edges to
122- # result in out-of-range windows when applying data augmentation.
123- chrom_sizes_dict = (dict (zip (sizes .chrom , sizes .chrsize )))
124- chip_seq_data ['window_max' ] = chip_seq_data ['end' ] + 500
125- chip_seq_data ['window_min' ] = chip_seq_data ['start' ] - 500
126-
127- chip_seq_data ['chr_limits_upper' ] = chip_seq_data ['chrom' ].map (
128- chrom_sizes_dict )
129- chip_seq_data = chip_seq_data [chip_seq_data ['window_max' ] <=
130- chip_seq_data ['chr_limits_upper' ]]
131- chip_seq_data = chip_seq_data [chip_seq_data ['window_min' ] >= 0 ]
132- chip_seq_data = chip_seq_data [['chrom' , 'start' , 'end' ]]
133-
134- return chip_seq_data
135-
136- def exclusion_regions (blacklist_file , chip_seq_data ):
137- """
138- This function takes as input a bound bed file (from multiGPS).
139- The assumption is that the bed file reports the peak center
140- For example: chr2 45 46
141- It converts these peak centers into 501 base pair windows, and adds them to
142- the exclusion list which will be used when constructing negative sets.
143- It also adds the mm10 blacklisted windows to the exclusion list.
144- Parameters:
145- blacklist_file (str): Path to the blacklist file.
146- chip_seq_data (dataFrame): The pandas chip-seq data loaded by load_chipseq_data
147- Returns:
148- exclusion_windows (BedTool): A bedtools object containing all exclusion windows.
149- bound_exclusion_windows (BedTool): A bedtool object containing only
150- those exclusion windows where there exists a binding site.
151- """
152- temp_chip_file = chip_seq_data .copy () # Doesn't modify OG array.
153- temp_chip_file ['start' ] = temp_chip_file ['start' ] - 250
154- temp_chip_file ['end' ] = temp_chip_file ['end' ] + 250
155-
156- if blacklist_file is None :
157- print ('No blacklist file specified ...' )
158- exclusion_windows = BedTool .from_dataframe (temp_chip_file [['chrom' , 'start' ,'end' ]])
45+ corods_out = pd .concat (filtered_chromosomes )
15946 else :
160- bound_exclusion_windows = BedTool .from_dataframe (temp_chip_file [['chrom' , 'start' ,'end' ]])
161- blacklist_exclusion_windows = BedTool (blacklist_file )
162- exclusion_windows = BedTool .cat (
163- * [blacklist_exclusion_windows , bound_exclusion_windows ])
164- return exclusion_windows
47+ corods_out = coords
48+ return corods_out
16549
16650def make_random_shift (coords , L , buffer = 25 ):
16751 """
@@ -172,12 +56,16 @@ def make_random_shift(coords, L, buffer=25):
17256 If training window length is L, then we must ensure that the
17357 peak center is still within the training window.
17458 Therefore: -L/2 < shift < L/2
175- To add in a buffer: -L/2 + 25 <= shift <= L/2 + 25
176- # Note: The 50 here is a tunable hyper-parameter.
177- Parameters:
178- coords(pandas dataFrame): This is an input bedfile (first 3 column names: "chr", "start", "end")
179- Returns:
180- shifted_coords(pandas dataFrame): The output bedfile with shifted coords
59+ To add in a buffer: -L/2 + buffer <= shift <= L/2 - 25
60+
61+ :param coords: input coordinate dataframe, first 3 columns are: [chrom, start, end]
62+ :type coords: pandas.DataFrame
63+ :param L: training/shifting window size, the shifted midpoint should fall into the range -L/2 + buffer ~ L/2 + buffer
64+ :type L: integer
65+ :param buffer: buffer size
66+ :type buffer: integer
67+ :rtype: shifted coordinate dataframe
68+
18169 """
18270 low = int (- L / 2 + buffer )
18371 high = int (L / 2 - buffer )
@@ -193,29 +81,41 @@ def make_flank(coords, L, d):
19381 Make flanking regions by:
19482 1. Shift midpoint by d
19583 2. Expand midpoint to upstream/downstream by L/2
84+
85+ :param coords: input coordinate dataframe, first 3 columns are: [chrom, start, end]
86+ :type coords: pandas.DataFrame
87+ :param L: window size of output regions
88+ :type L: integer
89+ :param d: shifting distance
90+ :type d: integer
91+ :rtype: shifted coordinate dataframe
19692 """
19793 return (coords .assign (midpoint = lambda x : (x ["start" ]+ x ["end" ])/ 2 )
19894 .astype ({"midpoint" : int })
19995 .assign (midpoint = lambda x : x ["midpoint" ] + d )
20096 .apply (lambda s : pd .Series ([s ["chrom" ], int (s ["midpoint" ]- L / 2 ), int (s ["midpoint" ]+ L / 2 )],
20197 index = ["chrom" , "start" , "end" ]), axis = 1 ))
20298
203- def random_coords (gs , incl , excl , l = 500 , n = 1000 ):
204- """
205- Randomly sample n intervals of length l from the genome,
206- shuffle to make all intervals inside the desired regions
207- and outside exclusion regions
208- """
209- return (BedTool ()
210- .random (l = l , n = n , g = gs )
211- .shuffle (g = gs , incl = incl .fn , excl = excl .fn )
212- .to_dataframe ()[["chrom" , "start" , "end" ]])
21399
214- def chop_genome (gs , chroms , excl , stride = 500 , l = 500 ):
100+ def chop_genome (chroms : list = None , excl : BedTool = None , gs = None , genome = None , stride = 500 , l = 500 ):
215101 """
216102 Given a genome size file and chromosome list,
217103 chop these chromosomes into intervals of length l,
218104 with include/exclude regions specified
105+
106+ :param chroms: list of chromosomes to be chopped
107+ :type chroms: list of strings
108+ :param excl: regions that chopped regions shouldn't overlap with
109+ :type excl: BedTool object
110+ :param gs: genome size file path, only one of `gs` and `genome` is required
111+ :type gs: string
112+ :param genome: genome build name, only one of `gs` and `genome` is required
113+ :type genome: string
114+ :param stride: step size that everytime chopped region start coordinate moving forward by
115+ :type stride: int
116+ :param l: interval window size
117+ :type l: int
118+ :rtype: chopped intervals coordinate dataframe
219119 """
220120 def intervals_loop (chrom , start , stride , l , size ):
221121 intervals = []
@@ -226,8 +126,15 @@ def intervals_loop(chrom, start, stride, l, size):
226126 break
227127 start += stride
228128 return pd .DataFrame (intervals , columns = ["chrom" , "start" , "end" ])
229-
230- genome_sizes = (pd .read_csv (gs , sep = "\t " , names = ["chrom" , "len" ])
129+
130+ if genome :
131+ genome_sizes = (pd .DataFrame (chromsizes (genome ))
132+ .T
133+ .rename (columns = {0 :"chrom" , 1 :"len" })
134+ .set_index ("chrom" )
135+ .loc [chroms ])
136+ elif gs :
137+ genome_sizes = (pd .read_csv (gs , sep = "\t " , usecols = [0 ,1 ], names = ["chrom" , "len" ])
231138 .set_index ("chrom" )
232139 .loc [chroms ])
233140 genome_chops = pd .concat ([intervals_loop (i .Index , 0 , stride , l , i .len )
@@ -237,13 +144,6 @@ def intervals_loop(chrom, start, stride, l, size):
237144 return (genome_chops_bdt .intersect (excl , v = True )
238145 .to_dataframe ()[["chrom" , "start" , "end" ]])
239146
240- def clean_bed (coords ):
241- """
242- Clean the bed file:
243- 1. Remove intervals with start < 0
244- """
245- return coords .loc [coords ["start" ]>= 0 ]
246-
247147class DNA2OneHot (object ):
248148 def __init__ (self ):
249149 self .DNA2Index = {
@@ -268,14 +168,22 @@ def __call__(self, dnaSeq):
268168 continue
269169 return seqMatrix
270170
171+ DNA2Index = {
172+ "A" : 0 ,
173+ "C" : 1 ,
174+ "G" : 2 ,
175+ "T" : 3 ,
176+ }
177+
271178def dna2OneHot (dnaSeq ):
272- DNA2Index = {
273- "A" : 0 ,
274- "C" : 1 ,
275- "G" : 2 ,
276- "T" : 3 ,
277- }
179+ """
180+ One-hot code input DNA sequence into an array of shape (4, len)
181+ Mapping is in the order of ACGT
278182
183+ :param dnaSeq: input DNA sequence
184+ :type dnaSeq: string
185+ :rtype: numpy array of shape (4, len)
186+ """
279187 seqLen = len (dnaSeq )
280188 # initialize the matrix as 4 x len(dnaSeq)
281189 seqMatrix = np .zeros ((4 , len (dnaSeq )), dtype = np .float32 )
@@ -291,6 +199,13 @@ def dna2OneHot(dnaSeq):
291199 return seqMatrix
292200
293201def rev_comp (inp_str ):
202+ """
203+ Return reverse complemented sequence
204+
205+ :param input_str: input DNA sequence
206+ :type input_str: string
207+ :rtype: reverse complemented DNA sequence
208+ """
294209 rc_dict = {'A' : 'T' , 'G' : 'C' , 'T' : 'A' , 'C' : 'G' , 'c' : 'g' ,
295210 'g' : 'c' , 't' : 'a' , 'a' : 't' , 'n' : 'n' , 'N' : 'N' }
296211 outp_str = list ()
@@ -347,8 +262,4 @@ def extract_info(chrom, start, end, label, genome_pyfasta, bigwigs, target_bam,
347262 for k ,t in transforms .items ():
348263 feature [k ] = t (feature [k ])
349264
350- return feature
351-
352- if __name__ == "__main__" :
353- chip_seq_coordinates = load_chipseq_data ("Bichrom/sample_data/Ascl1.peaks" , genome_sizes_file = "Bichrom/sample_data/mm10.info" ,
354- to_keep = None , to_filter = ['chr17' , 'chr11' , 'chrM' , 'chrUn' ])
265+ return feature
0 commit comments