diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..d0c3cbf
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = source
+BUILDDIR = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..9534b01
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 0000000..16dd169
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,78 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+import pathlib
+import sys
+sys.path.insert(0, pathlib.Path(__file__).parents[2].resolve().as_posix())
+
+# import MOCK
+from unittest import mock
+
+# Mock heavy third-party imports so autodoc can run without them installed
+autodoc_mock_imports = ["numpy",
+                        "pandas",
+                        "torch",
+                        "torch.utils.data",
+                        "pysam",
+                        "pybedtools",
+                        "pyfasta",
+                        "pyBigWig",
+                        "pytorch_lightning",
+                        "webdataset"]
+
+# -- Project information -----------------------------------------------------
+
+project = 'seqchromloader'
+copyright = '2023, Jianyu Yang'
+author = 'Jianyu Yang'
+
+# The full version, including alpha/beta/rc tags
+release = '0.2.4'
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.duration',
+    'sphinx.ext.autodoc',
+    'sphinx.ext.intersphinx'
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+intersphinx_mapping = {
+    'python': ('https://docs.python.org/3/', None),
+    'sphinx': ('https://www.sphinx-doc.org/en/master/', None),
+}
+intersphinx_disabled_domains = ['std']
+
+# -- Options for HTML output
+
+html_theme = 'sphinx_rtd_theme'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+# html_static_path = ['_static']
\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 0000000..10e2ce2
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,37 @@
+.. seqchromloader documentation master file, created by
+   sphinx-quickstart on Mon Mar 20 20:12:15 2023.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to seqchromloader's documentation!
+==========================================
+
+seqchromloader aims to provide versatile, ready-to-use writers and loaders for applying deep learning to bioinformatics studies.
+
+Supported and planned dataset formats:
+
+* webdataset (done)
+* tfrecord (x)
+
+Training framework support:
+
+* pytorch dataloader (done)
+* pytorch-lightning datamodule (done)
+* NVIDIA-DALI (x)
+
+Check out the :doc:`usage` section for further information, including how to
+:doc:`install` the project.
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   install
+   usage
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/docs/source/install.rst b/docs/source/install.rst
new file mode 100644
index 0000000..417d582
--- /dev/null
+++ b/docs/source/install.rst
@@ -0,0 +1,20 @@
+Installation
+============
+
+conda (suggested):
+
+.. code-block:: console
+
+   mamba install -c bioconda -c conda-forge seqchromloader
+
+or, using conda directly:
+
+.. code-block:: console
+
+   conda install -c bioconda -c conda-forge seqchromloader
+
+pip:
+
+.. code-block:: console
+
+   pip install seqchromloader
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
new file mode 100644
index 0000000..5fcf797
--- /dev/null
+++ b/docs/source/usage.rst
@@ -0,0 +1,82 @@
+Usage
+=====
+
+``seqchromloader`` is composed of two types of functions: ``writer`` and ``loader``. You can use a ``writer`` to dump a dataset into webdataset-format files for future use, or directly call a ``loader`` to get tensors immediately.
+
+``seqchromloader`` generally produces four kinds of tensors, **[seq, chrom, target, label]**, illustrated by the sketch after this list:
+
+* **seq** is a one-hot encoded DNA sequence tensor of shape *[batch_size, 4, len]*, using the base order "ACGT" (i.e., A = [1,0,0,0], C = [0,1,0,0], ...)
+* **chrom** is a chromatin track tensor of shape *[batch_size, # tracks, len]*; the chromatin track bigwig files are typically supplied via the ``bigwig_filelist`` parameter
+* **target** is a tensor holding the number of sequencing reads in each region, computed from the bam file given by the ``target_bam`` parameter
+* **label** is the integer label of each sample; with a bed file input this comes from the fourth column, while a pandas DataFrame input should have a column named *label*
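+
+As a rough illustration, iterating over a dataloader built as shown in the Loader section below yields batches along these lines; the exact sizes depend on your regions, tracks, and batch size:
+
+.. code-block:: python3
+
+   # illustrative only: `dataloader` is constructed as in the Loader section below
+   for seq, chrom, target, label in dataloader:
+       print(seq.shape)    # e.g. torch.Size([batch_size, 4, len])
+       print(chrom.shape)  # e.g. torch.Size([batch_size, n_tracks, len])
+       print(target.shape) # read counts, one value per sample
+       print(label.shape)  # integer labels, one value per sample
+       break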
+
+Writer
+------
+
+Currently only the webdataset format is supported; you can write tensors to webdataset files like this:
+
+.. code-block:: python3
+
+   import pandas as pd
+   from seqchromloader import dump_data_webdataset
+
+   coords = pd.DataFrame({
+       "chrom": ["chr1", "chr10"],
+       "start": [1000, 5000],
+       "end": [1200, 5200],
+       "label": [0, 1]
+   })
+   wds_file_lists = dump_data_webdataset(coords,
+                                         genome_fasta="mm10.fa",
+                                         bigwig_filelist=["h3k4me3.bw", "atacseq.bw"],
+                                         outdir="dataset/",
+                                         outprefix="test",
+                                         compress=True,
+                                         numProcessors=4,
+                                         transforms={"chrom": lambda x: x+1})
+
+.. note::
+   Each region should be of the same length! In this example, every region is 200bp long.
+
+The returned ``wds_file_lists`` contains the output file paths; each file holds ~7000 samples.
+
+One thing worth noting is the ``transforms`` parameter: it accepts a dictionary of functions, and each function is called on the output its key refers to. In this example, the add-1 lambda is applied to every ``chrom`` tensor; more complicated transformations can be done the same way, e.g., standardizing the signal.
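+
+For instance, a standardizing transform could be passed the same way (a sketch; ``standardize`` below is an illustrative helper, not part of the library):
+
+.. code-block:: python3
+
+   def standardize(x):
+       # z-score the chromatin signal of a single sample
+       return (x - x.mean()) / (x.std() + 1e-6)
+
+   wds_file_lists = dump_data_webdataset(coords,
+                                         genome_fasta="mm10.fa",
+                                         bigwig_filelist=["h3k4me3.bw", "atacseq.bw"],
+                                         outdir="dataset/",
+                                         outprefix="test_standardized",
+                                         compress=True,
+                                         numProcessors=4,
+                                         transforms={"chrom": standardize})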
+
+Loader
+------
+
+You can easily load the webdataset files generated by ``seqchromloader.dump_data_webdataset`` above:
+
+.. code-block:: python3
+
+   from seqchromloader import SeqChromDatasetByWds
+
+   dataloader = SeqChromDatasetByWds(wds_file_lists, transforms=None, rank=0, world_size=1)
+   seq, chrom, target, label = next(iter(dataloader))
+
+If you are using multiple GPUs, you can use ``rank`` and ``world_size`` to shard the dataset so that each GPU gets a non-overlapping piece of it.
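+
+For example, if a ``torch.distributed`` process group has already been initialized, each process can build its own shard (a sketch; adapt the rank/world-size discovery to your setup):
+
+.. code-block:: python3
+
+   import torch.distributed as dist
+
+   # assumes dist.init_process_group(...) was called elsewhere
+   rank = dist.get_rank()
+   world_size = dist.get_world_size()
+
+   dataloader = SeqChromDatasetByWds(wds_file_lists,
+                                     rank=rank,
+                                     world_size=world_size,
+                                     dataloader_kws={"num_workers": 4})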
+
+A more straightforward way is to use ``seqchromloader.SeqChromDatasetByBed``, which produces tensors directly from a bed file and the other required files.
+
+.. code-block:: python3
+
+   from seqchromloader import SeqChromDatasetByBed
+
+   dataloader = SeqChromDatasetByBed(bed="regions.bed",
+                                     genome_fasta="mm10.fa",
+                                     bigwig_filelist=["h3k4me3.bw", "atacseq.bw"],
+                                     target_bam="foxa1.bam",
+                                     transforms={"label": lambda x: x-1},
+                                     dataloader_kws={"num_workers": 4})
+   seq, chrom, target, label = next(iter(dataloader))
+
+Here a dictionary of keyword arguments is passed via ``dataloader_kws``; these are forwarded to ``torch.utils.data.DataLoader``, in this case to increase the number of workers (default is 1). Refer to the `PyTorch DataLoader documentation <https://pytorch.org/docs/stable/data.html>`_ to explore more controls on DataLoader behavior.
+
+API
+---
+
+.. autofunction:: seqchromloader.dump_data_webdataset
+
+.. autofunction:: seqchromloader.SeqChromDatasetByBed
+
+.. autofunction:: seqchromloader.SeqChromDatasetByWds
\ No newline at end of file
diff --git a/seqchromloader/loader.py b/seqchromloader/loader.py
index 419a2ca..6c66e5f 100644
--- a/seqchromloader/loader.py
+++ b/seqchromloader/loader.py
@@ -26,8 +26,13 @@ def worker_init_fn(worker_id):
     dataset.initialize()
 
 class SeqChromLoader():
+    """
+    :param dataloader_kws: keyword arguments passed to ``torch.utils.data.DataLoader``
+    :type dataloader_kws: dict of kwargs
+    """
     def __init__(self, SeqChromDataset):
         self.SeqChromDataset = SeqChromDataset
+        self.__doc__ = self.__doc__ + self.SeqChromDataset.__doc__
 
     def __call__(self, *args, dataloader_kws:dict={}, **kwargs):
         # default dataloader kws
@@ -46,6 +51,12 @@ def seqChromLoaderCurry(SeqChromDataset):
     return SeqChromLoader(SeqChromDataset)
 
 class _SeqChromDatasetByWds(IterableDataset):
+    """
+    :param wds: list of webdataset files to get samples from
+    :type wds: list of str
+    :param transforms: A dictionary of functions to transform the output data, accepted keys are **["seq", "chrom", "target", "label"]**
+    :type transforms: dict of functions
+    """
     def __init__(self, wds, transforms:dict=None, rank=0, world_size=1):
         self.wds = wds
         self.transforms = transforms
@@ -83,6 +94,18 @@ def __iter__(self):
 SeqChromDatasetByWds = seqChromLoaderCurry(_SeqChromDatasetByWds)
 
 class _SeqChromDatasetByBed(Dataset):
+    """
+    :param bed: Bed file describing genomic regions to extract info from; every region has to be of the same length.
+    :type bed: str
+    :param genome_fasta: Genome fasta file.
+    :type genome_fasta: str
+    :param bigwig_filelist: A list of bigwig files containing track information (e.g., histone modifications)
+    :type bigwig_filelist: list of str or None
+    :param target_bam: bam file used to count reads in each region
+    :type target_bam: str or None
+    :param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]*
+    :type transforms: dict of functions
+    """
     def __init__(self, bed, genome_fasta, bigwig_filelist:list, target_bam=None, transforms:dict=None, initialize_first=False):
         self.bed = pd.read_table(bed, header=None, names=['chrom', 'start', 'end', 'label', 'score', 'strand'
                                                            ])
diff --git a/seqchromloader/writer.py b/seqchromloader/writer.py
index 8742b3b..d0e5e5b 100644
--- a/seqchromloader/writer.py
+++ b/seqchromloader/writer.py
@@ -26,8 +26,26 @@ def dump_data_webdataset(coords, genome_fasta, bigwig_filelist,
                          numProcessors=1, transforms=None):
     """
-    Given coordinates dataframe, extract the sequence and chromatin signal,
-    Then save in **TFReocrd** format
+    Given a coordinates dataframe, extract the sequence and chromatin signal, then save them in webdataset format.
+
+    :param coords: pandas DataFrame containing genomic coordinates with columns **[chrom, start, end, label]**
+    :type coords: pandas DataFrame
+    :param genome_fasta: Genome fasta file.
+    :type genome_fasta: str
+    :param bigwig_filelist: A list of bigwig files containing track information (e.g., histone modifications)
+    :type bigwig_filelist: list of str or None
+    :param target_bam: bam file used to count reads in each region
+    :type target_bam: str or None
+    :param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]*
+    :type transforms: dict of functions
+    :param outdir: output directory to save files in
+    :type outdir: str
+    :param outprefix: prefix of output files
+    :type outprefix: str
+    :param compress: whether to compress the output files
+    :type compress: bool
+    :param numProcessors: number of processors to use
+    :type numProcessors: int
     """
     # split coordinates and assign chunks to workers