From 83c4cb20dd3ab47e43d8d683c990fb366ca5dd28 Mon Sep 17 00:00:00 2001 From: Jianyu Yang Date: Mon, 20 Mar 2023 22:09:51 -0400 Subject: [PATCH 1/8] add: docstring and sphix document --- docs/Makefile | 20 ++++++++++ docs/make.bat | 35 +++++++++++++++++ docs/source/conf.py | 68 +++++++++++++++++++++++++++++++++ docs/source/index.rst | 37 ++++++++++++++++++ docs/source/install.rst | 20 ++++++++++ docs/source/usage.rst | 82 ++++++++++++++++++++++++++++++++++++++++ seqchromloader/loader.py | 23 +++++++++++ seqchromloader/writer.py | 22 ++++++++++- 8 files changed, 305 insertions(+), 2 deletions(-) create mode 100644 docs/Makefile create mode 100644 docs/make.bat create mode 100644 docs/source/conf.py create mode 100644 docs/source/index.rst create mode 100644 docs/source/install.rst create mode 100644 docs/source/usage.rst diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..9534b01 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..ecbd354 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,68 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) +import pathlib +import sys +sys.path.insert(0, pathlib.Path(__file__).parents[2].resolve().as_posix()) + +# import MOCK +autodoc_mock_imports = ["torch", + "pysam", + "pybedtools", + "pyfasta", + "pyBigWig", + "pytorch_lightning", + "webdataset"] + +# -- Project information ----------------------------------------------------- + +project = 'seqchromloader' +copyright = '2023, Jianyu Yang' +author = 'Jianyu Yang' + +# The full version, including alpha/beta/rc tags +release = '0.2.4' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.duration', + 'sphinx.ext.autodoc' +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..10e2ce2 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,37 @@ +.. seqchromloader documentation master file, created by + sphinx-quickstart on Mon Mar 20 20:12:15 2023. 
+ You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to seqchromloader's documentation! +========================================== + +seqchromloader aims to provide versatile and ready-to-use writer/loader for applying deep learning to bioinformatics study. + +Plan to support dataset formats including: + +* webdataset (done) +* tfrecord (x) + +Training framework support: + +* pytorch dataloader (done) +* pytorch-lightning datamodule (done) +* NVIDIA-DALI (x) + +Check out the :doc:`usage` section for further information, including how to +:doc:`install` the project. + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + install + usage + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/source/install.rst b/docs/source/install.rst new file mode 100644 index 0000000..417d582 --- /dev/null +++ b/docs/source/install.rst @@ -0,0 +1,20 @@ +Installation +============ + +conda (suggested): + +.. code-block:: console + + conda install -c bioconda -c conda-forge seqchromloader + +or + +.. code-block:: console + + mamba install -c bioconda -c conda-forge seqchromloader + +pip + +.. code-block:: console + + pip install seqchromloader diff --git a/docs/source/usage.rst b/docs/source/usage.rst new file mode 100644 index 0000000..5fcf797 --- /dev/null +++ b/docs/source/usage.rst @@ -0,0 +1,82 @@ +Usage +===== + +``seqchromloader`` is composed of two types of functions: ``writer`` and ``loader``. You can use ``writer`` to dump dataset into webdataset format file for future use, or directly call ``loader`` to get tensors immediately. + +Generally ``seqchromloader`` would produce four kinds of tensors: **[seq, chrom, target, label]** + +* **seq** is one-hot coded DNA sequence tensor of shape *[batch_size, 4, len]* using the DNA mapping order of "ACGT" (which means, A = [1,0,0,0], C = [0,1,0,0], ...) 
+* **chrom** is chromatin track tensor of shape *[batch_size, # tracks, len]*, chromatin track bigwig files are usually provided by ``bigwig_filelist`` parameter +* **target** is the tensor representing the number of sequencing reads in the region, this is from the bam file given by ``target_bam`` parameter +* **label** is the integer label of each sample, when given bed file input, this info would be from the fourth column. When given a pandas DataFrame, it should have a column named *label* + +Writer +------ + +Currently only webdataset format is supported, you can write tensors into webdataset in this way: + +.. code-block:: python3 + + import pandas as pd + from seqchromloader import dump_data_webdataset + + coords = pd.DataFrame({ + "chrom": ["chr1", "chr10"], + "start": [1000, 5000], + "end": [1200, 5200], + "label": [0, 1] + }) + wds_file_lists = dump_data_webdataset(coords, + genome_fasta="mm10.fa", + bigwig_filelist=["h3k4me3.bw", "atacseq.bw"], + outdir="dataset/", + outprefix="test", + compress=True, + numProcessors=4, + transforms={"chrom": lambda x: x+1}) + +.. note:: + Each region should be of the same length! As in this example, every region is 200bp long. + +The returned ``wds_file_lists`` contains the output file paths, every file has ~7000 samples. + +One thing worth noting is the ``transforms`` parameter here, ``transforms`` accepts a dictionary of functions, each function will be called on the output that its key refers to. In this example, the add 1 lambda function was called on each ``chrom`` tensor, you can do more complicated transformations in this way, e.g., standardize the tensor. + +Loader +------ + +You can easily load the webdataset files generated by ``seqchromloader.dump_data_webdataset`` above by: + +.. 
code-block:: python3 + + from seqchromloader import SeqChromDatasetByWds + + dataloader = SeqChromDatasetByWds(wds_file_lists, transforms=None, rank=0, world_size=1) + seq, chrom, target, label = next(iter(dataloader)) + +If you are using multiple GPUs, you can use ``rank`` and ``world_size`` to shard the dataset so that each GPU gets a non-overlapping piece of it + +A more straightforward way is using ``seqchromloader.SeqChromDatasetByBed``, which can output tensors given a bed file and other required files. + +.. code-block:: python3 + + from seqchromloader import SeqChromDatasetByBed + + dataloader = SeqChromDatasetByBed(bed="regions.bed", + genome_fasta="mm10.fa", + bigwig_filelist=["h3k4me3.bw", "atacseq.bw"], + target_bam="foxa1.bam", + transforms={"label": lambda x: x-1}, + dataloader_kws={"num_workers": 4}) + seq, chrom, target, label = next(iter(dataloader)) + +Here a dictionary of keyword arguments is passed on to ``torch.utils.data.DataLoader`` to increase the number of workers (default is 1), you can refer to `PyTorch DataLoader documentation <https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader>`_ to explore more controls on DataLoader behavior + +API +--- + +.. autofunction:: seqchromloader.dump_data_webdataset + +.. autofunction:: seqchromloader.SeqChromDatasetByBed + +.. 
autofunction:: seqchromloader.SeqChromDatasetByWds \ No newline at end of file diff --git a/seqchromloader/loader.py b/seqchromloader/loader.py index 419a2ca..6c66e5f 100644 --- a/seqchromloader/loader.py +++ b/seqchromloader/loader.py @@ -26,8 +26,13 @@ def worker_init_fn(worker_id): dataset.initialize() class SeqChromLoader(): + """ + :param dataloader_kws: keyword arguments passed to ``torch.utils.data.DataLoader`` + :type dataloader_kws: dict of kwargs + """ def __init__(self, SeqChromDataset): self.SeqChromDataset = SeqChromDataset + self.__doc__ = self.__doc__ + self.SeqChromDataset.__doc__ def __call__(self, *args, dataloader_kws:dict={}, **kwargs): # default dataloader kws @@ -46,6 +51,12 @@ def seqChromLoaderCurry(SeqChromDataset): return SeqChromLoader(SeqChromDataset) class _SeqChromDatasetByWds(IterableDataset): + """ + :param wds: list of webdataset files to get samples from + :type wds: list of str + :param transforms: A dictionary of functions to transform the output data, accepted keys are **["seq", "chrom", "target", "label"]** + :type transforms: dict of functions + """ def __init__(self, wds, transforms:dict=None, rank=0, world_size=1): self.wds = wds self.transforms = transforms @@ -83,6 +94,18 @@ def __iter__(self): SeqChromDatasetByWds = seqChromLoaderCurry(_SeqChromDatasetByWds) class _SeqChromDatasetByBed(Dataset): + """ + :param bed: Bed file describing genomics regions to extract info from, every region has to be of the same length. + :type bed: str + :param genome_fasta: Genome fasta file. 
+ :type genome_fasta: str + :param bigwig_filelist: A list of bigwig files containing track information (e.g., histone modifications) + :type bigwig_filelist: list of str or None + :param target_bam: bam file to get # reads in each region + :type target_bam: str or None + :param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]* + :type transforms: dict of functions + """ def __init__(self, bed, genome_fasta, bigwig_filelist:list, target_bam=None, transforms:dict=None, initialize_first=False): self.bed = pd.read_table(bed, header=None, names=['chrom', 'start', 'end', 'label', 'score', 'strand' ]) diff --git a/seqchromloader/writer.py b/seqchromloader/writer.py index 8742b3b..d0e5e5b 100644 --- a/seqchromloader/writer.py +++ b/seqchromloader/writer.py @@ -26,8 +26,26 @@ def dump_data_webdataset(coords, genome_fasta, bigwig_filelist, numProcessors=1, transforms=None): """ - Given coordinates dataframe, extract the sequence and chromatin signal, - Then save in **TFReocrd** format + Given coordinates dataframe, extract the sequence and chromatin signal, save in webdataset format + + :param coords: pandas DataFrame containing genomic coordinates with columns **[chrom, start, end, label]** + :type coords: pandas DataFrame + :param genome_fasta: Genome fasta file. 
+ :type genome_fasta: str + :param bigwig_filelist: A list of bigwig files containing track information (e.g., histone modifications) + :type bigwig_filelist: list of str or None + :param target_bam: bam file to get # reads in each region + :type target_bam: str or None + :param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]* + :type transforms: dict of functions + :param outdir: output directory to save files in + :type outdir: str + :param outprefix: prefix of output files + :type outprefix: str + :param compress: whether to compress the output files + :type compress: boolean + :param numProcessors: number of processors + :type numProcessors: int """ # split coordinates and assign chunks to workers From 35caefe1da81394038f54a67949674d32e1f35e1 Mon Sep 17 00:00:00 2001 From: Jianyu Yang Date: Mon, 20 Mar 2023 22:15:08 -0400 Subject: [PATCH 2/8] replace sphinx theme --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index ecbd354..7b3a726 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -60,7 +60,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'alabaster' +html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, From c33c25e7a909c2a0cd59a245485fbb23543009a2 Mon Sep 17 00:00:00 2001 From: Jianyu Yang Date: Mon, 20 Mar 2023 22:18:12 -0400 Subject: [PATCH 3/8] Update conf.py --- docs/source/conf.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 7b3a726..ba030f3 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -60,9 +60,19 @@ # The theme to use for HTML and HTML Help pages. 
See the documentation for # a list of builtin themes. # +intersphinx_mapping = { + 'python': ('https://docs.python.org/3/', None), + 'sphinx': ('https://www.sphinx-doc.org/en/master/', None), +} +intersphinx_disabled_domains = ['std'] + +templates_path = ['_templates'] + +# -- Options for HTML output + html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] \ No newline at end of file +# html_static_path = ['_static'] \ No newline at end of file From 92d1f4b587cf2782e0f12b4af6d1a441767c0a0f Mon Sep 17 00:00:00 2001 From: Jianyu Yang Date: Mon, 20 Mar 2023 22:22:20 -0400 Subject: [PATCH 4/8] fix: mock imports --- docs/source/conf.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index ba030f3..90cb5e0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -18,13 +18,18 @@ sys.path.insert(0, pathlib.Path(__file__).parents[2].resolve().as_posix()) # import MOCK -autodoc_mock_imports = ["torch", - "pysam", - "pybedtools", - "pyfasta", - "pyBigWig", - "pytorch_lightning", - "webdataset"] +from unittest import mock + +# Mock open3d because it fails to build in readthedocs +MOCK_MODULES = ["torch", + "pysam", + "pybedtools", + "pyfasta", + "pyBigWig", + "pytorch_lightning", + "webdataset"] +for mod_name in MOCK_MODULES: + sys.modules[mod_name] = mock.Mock() # -- Project information ----------------------------------------------------- @@ -54,12 +59,6 @@ # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [] - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. 
-# intersphinx_mapping = { 'python': ('https://docs.python.org/3/', None), 'sphinx': ('https://www.sphinx-doc.org/en/master/', None), From a1489e3f4ee268573eecb566b6791b51ba7084bf Mon Sep 17 00:00:00 2001 From: Jianyu Yang Date: Mon, 20 Mar 2023 22:25:29 -0400 Subject: [PATCH 5/8] fix: mock imports --- docs/source/conf.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 90cb5e0..eec861c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -20,8 +20,10 @@ # import MOCK from unittest import mock -# Mock open3d because it fails to build in readthedocs -MOCK_MODULES = ["torch", +# Mock imports +MOCK_MODULES = ["numpy", + "pandas" + "torch", "pysam", "pybedtools", "pyfasta", From 20e6f2229bc227693c20e9186acd865b084823e2 Mon Sep 17 00:00:00 2001 From: Jianyu Yang Date: Mon, 20 Mar 2023 22:27:02 -0400 Subject: [PATCH 6/8] Update conf.py --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index eec861c..3b44355 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -22,7 +22,7 @@ # Mock imports MOCK_MODULES = ["numpy", - "pandas" + "pandas", "torch", "pysam", "pybedtools", From 32b113c4ef6188df4882b8db3e4516315400b4e3 Mon Sep 17 00:00:00 2001 From: Jianyu Yang Date: Mon, 20 Mar 2023 22:28:14 -0400 Subject: [PATCH 7/8] Update conf.py --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 3b44355..5497e24 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -24,6 +24,7 @@ MOCK_MODULES = ["numpy", "pandas", "torch", + "torch.utils.data", "pysam", "pybedtools", "pyfasta", From 76ed44d50410ba84e1cdbbd13c7538aec57d3cc0 Mon Sep 17 00:00:00 2001 From: Jianyu Yang Date: Mon, 20 Mar 2023 22:47:21 -0400 Subject: [PATCH 8/8] Update conf.py --- docs/source/conf.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff 
--git a/docs/source/conf.py b/docs/source/conf.py index 5497e24..16dd169 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -21,18 +21,16 @@ from unittest import mock # Mock imports -MOCK_MODULES = ["numpy", - "pandas", - "torch", - "torch.utils.data", - "pysam", - "pybedtools", - "pyfasta", - "pyBigWig", - "pytorch_lightning", - "webdataset"] -for mod_name in MOCK_MODULES: - sys.modules[mod_name] = mock.Mock() +autodoc_mock_imports = ["numpy", + "pandas", + "torch", + "torch.utils.data", + "pysam", + "pybedtools", + "pyfasta", + "pyBigWig", + "pytorch_lightning", + "webdataset"] # -- Project information -----------------------------------------------------