From 83c4cb20dd3ab47e43d8d683c990fb366ca5dd28 Mon Sep 17 00:00:00 2001
From: Jianyu Yang <yztxwd@gmail.com>
Date: Mon, 20 Mar 2023 22:09:51 -0400
Subject: [PATCH 1/8] add: docstring and sphinx document

---
 docs/Makefile            | 20 ++++++++++
 docs/make.bat            | 35 +++++++++++++++++
 docs/source/conf.py      | 68 +++++++++++++++++++++++++++++++++
 docs/source/index.rst    | 37 ++++++++++++++++++
 docs/source/install.rst  | 20 ++++++++++
 docs/source/usage.rst    | 82 ++++++++++++++++++++++++++++++++++++++++
 seqchromloader/loader.py | 23 +++++++++++
 seqchromloader/writer.py | 22 ++++++++++-
 8 files changed, 305 insertions(+), 2 deletions(-)
 create mode 100644 docs/Makefile
 create mode 100644 docs/make.bat
 create mode 100644 docs/source/conf.py
 create mode 100644 docs/source/index.rst
 create mode 100644 docs/source/install.rst
 create mode 100644 docs/source/usage.rst

diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..d0c3cbf
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..9534b01
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 0000000..ecbd354
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,68 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+import pathlib
+import sys
+sys.path.insert(0, pathlib.Path(__file__).parents[2].resolve().as_posix())
+
+# import MOCK
+autodoc_mock_imports = ["torch",
+                        "pysam",
+                        "pybedtools",
+                        "pyfasta",
+                        "pyBigWig",
+                        "pytorch_lightning",
+                        "webdataset"]
+
+# -- Project information -----------------------------------------------------
+
+project = 'seqchromloader'
+copyright = '2023, Jianyu Yang'
+author = 'Jianyu Yang'
+
+# The full version, including alpha/beta/rc tags
+release = '0.2.4'
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.duration',
+    'sphinx.ext.autodoc'
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'alabaster'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 0000000..10e2ce2
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,37 @@
+.. seqchromloader documentation master file, created by
+   sphinx-quickstart on Mon Mar 20 20:12:15 2023.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to seqchromloader's documentation!
+==========================================
+
+seqchromloader aims to provide versatile, ready-to-use writers and loaders for applying deep learning to bioinformatics studies.
+
+Plan to support dataset formats including:
+
+* webdataset (done)
+* tfrecord (x)
+
+Training framework support:
+
+* pytorch dataloader (done)
+* pytorch-lightning datamodule (done)
+* NVIDIA-DALI (x)
+
+Check out the :doc:`usage` section for further information, including how to
+:doc:`install` the project.
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   install
+   usage
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/docs/source/install.rst b/docs/source/install.rst
new file mode 100644
index 0000000..417d582
--- /dev/null
+++ b/docs/source/install.rst
@@ -0,0 +1,20 @@
+Installation
+============
+
+conda (suggested):
+
+.. code-block:: console
+
+    conda install -c bioconda -c conda-forge seqchromloader
+
+or
+
+.. code-block:: console
+
+    mamba install -c bioconda -c conda-forge seqchromloader
+
+pip
+
+.. code-block:: console
+
+    pip install seqchromloader
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
new file mode 100644
index 0000000..5fcf797
--- /dev/null
+++ b/docs/source/usage.rst
@@ -0,0 +1,82 @@
+Usage
+=====
+
+``seqchromloader`` is composed of two types of functions: ``writer`` and ``loader``. You can use a ``writer`` to dump a dataset into webdataset format files for future use, or directly call a ``loader`` to get tensors immediately.
+
+Generally ``seqchromloader`` produces four kinds of tensors: **[seq, chrom, target, label]**
+
+* **seq** is one-hot coded DNA sequence tensor of shape *[batch_size, 4, len]* using the DNA mapping order of "ACGT" (which means, A = [1,0,0,0], C = [0,1,0,0], ...)
+* **chrom** is chromatin track tensor of shape *[batch_size, # tracks, len]*, chromatin track bigwig files are usually provided by ``bigwig_filelist`` parameter
+* **target** is the tensor representing the number of sequencing reads in the region, this is from the bam file given by ``target_bam`` parameter
+* **label** is the integer label of each sample. When a bed file is given as input, the label is taken from the fourth column; when a pandas DataFrame is given, it should have a column named *label*
+
+Writer
+------
+
+Currently only webdataset format is supported, you can write tensors into webdataset in this way:
+
+.. code-block:: python3
+
+    import pandas as pd
+    from seqchromloader import dump_data_webdataset
+
+    coords = pd.DataFrame({
+                "chrom": ["chr1", "chr10"],
+                "start": [1000, 5000],
+                "end": [1200, 5200],
+                "label": [0, 1]
+            })
+    wds_file_lists = dump_data_webdataset(coords,
+                                     genome_fasta="mm10.fa",
+                                     bigwig_filelist=["h3k4me3.bw", "atacseq.bw"],
+                                     outdir="dataset/",
+                                     outprefix="test",
+                                     compress=True,
+                                     numProcessors=4,
+                                     transforms={"chrom": lambda x: x+1})
+
+.. note:: 
+    Each region should be of the same length! As in this example, every region is 200bp long.
+
+The returned ``wds_file_lists`` contain the output file paths, every file has ~7000 samples.
+
+One thing worth noting is the ``transforms`` parameter: it accepts a dictionary of functions, and each function is called on the output that its key refers to. In this example, the add-1 lambda function is called on each ``chrom`` tensor. You can do more complicated transformations in the same way, e.g., standardizing the tensor.
+
+Loader
+------
+
+You can easily load the webdataset files generated by ``seqchromloader.dump_data_webdataset`` above by:
+
+.. code-block:: python3
+
+    from seqchromloader import SeqChromDatasetByWds
+
+    dataloader = SeqChromDatasetByWds(wds_file_lists, transforms=None, rank=0, world_size=1)
+    seq, chrom, target, label = next(iter(dataloader))
+
+If you are using multiple GPUs, you can use ``rank`` and ``world_size`` to shard the dataset so that each GPU gets a non-overlapping piece of it.
+
+A more straightforward way is using ``seqchromloader.SeqChromDatasetByBed``, which can output tensors given a bed file and other required files.
+
+.. code-block:: python3
+
+    from seqchromloader import SeqChromDatasetByBed
+
+    dataloader = SeqChromDatasetByBed(bed="regions.bed",
+                                      genome_fasta="mm10.fa",
+                                      bigwig_filelist=["h3k4me3.bw", "atacseq.bw"],
+                                      target_bam="foxa1.bam",
+                                      transforms={"label": lambda x: x-1},
+                                      dataloader_kws={"num_workers": 4})
+    seq, chrom, target, label = next(iter(dataloader))
+
+Here a dictionary of keyword arguments is passed via ``dataloader_kws``; these are forwarded to ``torch.utils.data.DataLoader``, in this case to increase the number of workers (default is 1). Refer to the `Pytorch DataLoader Document <https://pytorch.org/docs/stable/data.html>`_ to explore more controls on DataLoader behavior.
+
+API
+---
+
+.. autofunction:: seqchromloader.dump_data_webdataset
+
+.. autofunction:: seqchromloader.SeqChromDatasetByBed
+
+.. autofunction:: seqchromloader.SeqChromDatasetByWds
\ No newline at end of file
diff --git a/seqchromloader/loader.py b/seqchromloader/loader.py
index 419a2ca..6c66e5f 100644
--- a/seqchromloader/loader.py
+++ b/seqchromloader/loader.py
@@ -26,8 +26,13 @@ def worker_init_fn(worker_id):
     dataset.initialize()
 
 class SeqChromLoader():
+    """
+    :param dataloader_kws: keyword arguments passed to ``torch.utils.data.DataLoader``
+    :type dataloader_kws: dict of kwargs
+    """
     def __init__(self, SeqChromDataset):
         self.SeqChromDataset = SeqChromDataset
+        self.__doc__ = self.__doc__ + self.SeqChromDataset.__doc__
 
     def __call__(self, *args, dataloader_kws:dict={}, **kwargs):
         # default dataloader kws
@@ -46,6 +51,12 @@ def seqChromLoaderCurry(SeqChromDataset):
     return SeqChromLoader(SeqChromDataset)
 
 class _SeqChromDatasetByWds(IterableDataset):
+    """
+    :param wds: list of webdataset files to get samples from
+    :type wds: list of str
+    :param transforms: A dictionary of functions to transform the output data, accepted keys are **["seq", "chrom", "target", "label"]**
+    :type transforms: dict of functions
+    """
     def __init__(self, wds, transforms:dict=None, rank=0, world_size=1):
         self.wds = wds
         self.transforms = transforms
@@ -83,6 +94,18 @@ def __iter__(self):
 SeqChromDatasetByWds = seqChromLoaderCurry(_SeqChromDatasetByWds)
 
 class _SeqChromDatasetByBed(Dataset):
+    """
+    :param bed: Bed file describing genomics regions to extract info from, every region has to be of the same length.
+    :type bed: str
+    :param genome_fasta: Genome fasta file.
+    :type genome_fasta: str
+    :param bigwig_filelist: A list of bigwig files containing track information (e.g., histone modifications)
+    :type bigwig_filelist: list of str or None
+    :param target_bam: bam file to get # reads in each region
+    :type target_bam: str or None
+    :param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]*
+    :type transforms: dict of functions
+    """
     def __init__(self, bed, genome_fasta, bigwig_filelist:list, target_bam=None, transforms:dict=None, initialize_first=False):
         self.bed = pd.read_table(bed, header=None, names=['chrom', 'start', 'end', 'label', 'score', 'strand' ])
 
diff --git a/seqchromloader/writer.py b/seqchromloader/writer.py
index 8742b3b..d0e5e5b 100644
--- a/seqchromloader/writer.py
+++ b/seqchromloader/writer.py
@@ -26,8 +26,26 @@ def dump_data_webdataset(coords, genome_fasta, bigwig_filelist,
                         numProcessors=1,
                         transforms=None):
     """
-    Given coordinates dataframe, extract the sequence and chromatin signal,
-    Then save in **TFReocrd** format
+    Given coordinates dataframe, extract the sequence and chromatin signal, save in webdataset format
+
+    :param coords: pandas DataFrame containing genomic coordinates with columns **[chrom, start, end, label]**
+    :type coords: pandas DataFrame
+    :param genome_fasta: Genome fasta file.
+    :type genome_fasta: str
+    :param bigwig_filelist: A list of bigwig files containing track information (e.g., histone modifications)
+    :type bigwig_filelist: list of str or None
+    :param target_bam: bam file to get # reads in each region
+    :type target_bam: str or None
+    :param transforms: A dictionary of functions to transform the output data, accepted keys are *["seq", "chrom", "target", "label"]*
+    :type transforms: dict of functions
+    :param outdir: output directory to save files in
+    :type outdir: str
+    :param outprefix: prefix of output files
+    :type outprefix: str
+    :param compress: whether to compress the output files
+    :type compress: boolean
+    :param numProcessors: number of processors
+    :type numProcessors: int
     """
 
     # split coordinates and assign chunks to workers

From 35caefe1da81394038f54a67949674d32e1f35e1 Mon Sep 17 00:00:00 2001
From: Jianyu Yang <yztxwd@gmail.com>
Date: Mon, 20 Mar 2023 22:15:08 -0400
Subject: [PATCH 2/8] replace sphinx theme

---
 docs/source/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index ecbd354..7b3a726 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -60,7 +60,7 @@
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
-html_theme = 'alabaster'
+html_theme = 'sphinx_rtd_theme'
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,

From c33c25e7a909c2a0cd59a245485fbb23543009a2 Mon Sep 17 00:00:00 2001
From: Jianyu Yang <yztxwd@gmail.com>
Date: Mon, 20 Mar 2023 22:18:12 -0400
Subject: [PATCH 3/8] Update conf.py

---
 docs/source/conf.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 7b3a726..ba030f3 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -60,9 +60,19 @@
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
+intersphinx_mapping = {
+    'python': ('https://docs.python.org/3/', None),
+    'sphinx': ('https://www.sphinx-doc.org/en/master/', None),
+}
+intersphinx_disabled_domains = ['std']
+
+templates_path = ['_templates']
+
+# -- Options for HTML output
+
 html_theme = 'sphinx_rtd_theme'
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
\ No newline at end of file
+# html_static_path = ['_static']
\ No newline at end of file

From 92d1f4b587cf2782e0f12b4af6d1a441767c0a0f Mon Sep 17 00:00:00 2001
From: Jianyu Yang <yztxwd@gmail.com>
Date: Mon, 20 Mar 2023 22:22:20 -0400
Subject: [PATCH 4/8] fix: mock imports

---
 docs/source/conf.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index ba030f3..90cb5e0 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -18,13 +18,18 @@
 sys.path.insert(0, pathlib.Path(__file__).parents[2].resolve().as_posix())
 
 # import MOCK
-autodoc_mock_imports = ["torch",
-                        "pysam",
-                        "pybedtools",
-                        "pyfasta",
-                        "pyBigWig",
-                        "pytorch_lightning",
-                        "webdataset"]
+from unittest import mock
+
+# Mock open3d because it fails to build in readthedocs
+MOCK_MODULES = ["torch",
+                "pysam",
+                "pybedtools",
+                "pyfasta",
+                "pyBigWig",
+                "pytorch_lightning",
+                "webdataset"]
+for mod_name in MOCK_MODULES:
+    sys.modules[mod_name] = mock.Mock()
 
 # -- Project information -----------------------------------------------------
 
@@ -54,12 +59,6 @@
 # This pattern also affects html_static_path and html_extra_path.
 exclude_patterns = []
 
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-#
 intersphinx_mapping = {
     'python': ('https://docs.python.org/3/', None),
     'sphinx': ('https://www.sphinx-doc.org/en/master/', None),

From a1489e3f4ee268573eecb566b6791b51ba7084bf Mon Sep 17 00:00:00 2001
From: Jianyu Yang <yztxwd@gmail.com>
Date: Mon, 20 Mar 2023 22:25:29 -0400
Subject: [PATCH 5/8] fix: mock imports

---
 docs/source/conf.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 90cb5e0..eec861c 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -20,8 +20,10 @@
 # import MOCK
 from unittest import mock
 
-# Mock open3d because it fails to build in readthedocs
-MOCK_MODULES = ["torch",
+# Mock imports
+MOCK_MODULES = ["numpy",
+                "pandas"
+                "torch",
                 "pysam",
                 "pybedtools",
                 "pyfasta",

From 20e6f2229bc227693c20e9186acd865b084823e2 Mon Sep 17 00:00:00 2001
From: Jianyu Yang <yztxwd@gmail.com>
Date: Mon, 20 Mar 2023 22:27:02 -0400
Subject: [PATCH 6/8] Update conf.py

---
 docs/source/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index eec861c..3b44355 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -22,7 +22,7 @@
 
 # Mock imports
 MOCK_MODULES = ["numpy",
-                "pandas"
+                "pandas",
                 "torch",
                 "pysam",
                 "pybedtools",

From 32b113c4ef6188df4882b8db3e4516315400b4e3 Mon Sep 17 00:00:00 2001
From: Jianyu Yang <yztxwd@gmail.com>
Date: Mon, 20 Mar 2023 22:28:14 -0400
Subject: [PATCH 7/8] Update conf.py

---
 docs/source/conf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 3b44355..5497e24 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -24,6 +24,7 @@
 MOCK_MODULES = ["numpy",
                 "pandas",
                 "torch",
+                "torch.utils.data",
                 "pysam",
                 "pybedtools",
                 "pyfasta",

From 76ed44d50410ba84e1cdbbd13c7538aec57d3cc0 Mon Sep 17 00:00:00 2001
From: Jianyu Yang <yztxwd@gmail.com>
Date: Mon, 20 Mar 2023 22:47:21 -0400
Subject: [PATCH 8/8] Update conf.py

---
 docs/source/conf.py | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 5497e24..16dd169 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -21,18 +21,16 @@
 from unittest import mock
 
 # Mock imports
-MOCK_MODULES = ["numpy",
-                "pandas",
-                "torch",
-                "torch.utils.data",
-                "pysam",
-                "pybedtools",
-                "pyfasta",
-                "pyBigWig",
-                "pytorch_lightning",
-                "webdataset"]
-for mod_name in MOCK_MODULES:
-    sys.modules[mod_name] = mock.Mock()
+autodoc_mock_imports = ["numpy",
+                        "pandas",
+                        "torch",
+                        "torch.utils.data",
+                        "pysam",
+                        "pybedtools",
+                        "pyfasta",
+                        "pyBigWig",
+                        "pytorch_lightning",
+                        "webdataset"]
 
 # -- Project information -----------------------------------------------------