MontgomeryLab · taimontgomery · Aug 4, 2022 · Jul 20, 2022 · Jul 20, 2022 · Jul 20, 2022
diff --git a/conda/conda-linux-64.lock b/conda/conda-linux-64.lock
diff --git a/conda/conda-osx-64.lock b/conda/conda-osx-64.lock
diff --git a/conda/conda-r-linux-64.lock b/conda/conda-r-linux-64.lock
diff --git a/conda/conda-r-osx-64.lock b/conda/conda-r-osx-64.lock
diff --git a/conda/environment.yml b/conda/environment.yml
@@ -3,21 +3,22 @@ channels:
   - bioconda
   - main
 dependencies:
-  - python>=3.7
-  - numpy==1.19.2
-  - pandas==1.2.4
-  - conda-forge::matplotlib==3.4.3
-  - fastp==0.22.0
-  - bowtie==1.2.3
-  - tbb==2020.3  # Bowtie dependency, temporary fix. v2021.x breaks bowtie, is not properly pinned to v2020 in bioconda
-  - nodejs==10.13.0
-  - conda-forge::cwltool==3.1.20211001174446
-  - conda-forge::psutil==5.8.0
+  - python>=3.9
+  - bioconductor-deseq2==1.34.0
+  - bowtie==1.3.1
+  - fastp==0.23.2
+  - htseq==2.0.2
+  - nodejs==16.13.1
+  - numpy==1.23.1
+  - pandas==1.4.3
+  - pip==22.1.2
+  - setuptools==63.1.0
+  - conda-forge::cwltool==3.1.20220628170238
+  - conda-forge::matplotlib==3.5.2
   - conda-forge::mscorefonts==0.0.1
+  - conda-forge::psutil==5.9.1
   - conda-forge::r-base==4.1.1
-  - setuptools==52.0.0
-  - pip==21.2.4
   - pip:
-    - --use-feature=in-tree-build
-    - htseq==0.13.5
-    - ../ # Install tinyRNA via setup.py
+    - ../ # Install tinyRNA via setup.py
+variables:
+  - PYTHONNOUSERSITE: 1
diff --git a/setup.py b/setup.py
@@ -7,13 +7,13 @@
 
 # Package metadata
 NAME = 'tinyrna'
-DESCRIPTION = 'Comprehensive analysis of small RNA high-throughput sequencing data'
+DESCRIPTION = 'Precision analysis of small RNA high-throughput sequencing data'
 URL = 'https://github.com/MontgomeryLab/tinyrna/'
 EMAIL = 'ajtate@colostate.edu'
 AUTHOR = 'Kristen Brown, Alex Tate'
 PLATFORM = 'Unix'
-REQUIRES_PYTHON = '>=3.7.0'
-VERSION = '0.1'
+REQUIRES_PYTHON = '>=3.9.0'
+VERSION = '1.0'
 
 # Required packages are installed via Conda's environment.yml
 # See PreFlight below...

diff --git a/setup.sh b/setup.sh
@@ -6,10 +6,8 @@
 
 [[ $# -eq 1 ]] && env_name=$1 || env_name="tinyrna"
 
-python_version="3.7"
+python_version="3.9"
 miniconda_version="4.12.0"
-bioc_version="3.14"
-tested_bioc_versions="3.1[2-4]"
 
 function success() {
   check="✓"
@@ -83,12 +81,12 @@ if [[ "$OSTYPE" == "darwin"* ]]; then
   success "macOS detected"
   shell=$(basename "$(dscl . -read ~/ UserShell | cut -f 2 -d " ")")
   miniconda_installer="Miniconda3-py${python_version/./}_${miniconda_version}-MacOSX-x86_64.sh"
-  platform_lock_file="./conda/conda-r-osx-64.lock"
+  platform_lock_file="./conda/conda-osx-64.lock"
 elif [[ "$OSTYPE" == "linux-gnu" ]]; then
   success "Linux detected"
   shell="$(basename "$SHELL")"
   miniconda_installer="Miniconda3-py${python_version/./}_${miniconda_version}-Linux-x86_64.sh"
-  platform_lock_file="./conda/conda-r-linux-64.lock"
+  platform_lock_file="./conda/conda-linux-64.lock"
 else
   fail "Unsupported OS"
   exit 1
@@ -123,46 +121,6 @@ else
   rm $miniconda_installer
 fi
 
-# By default, assume that host environment does not contain an appropriate DESeq2 version
-install_R_deseq2=true
-
-# Check if R is installed
-if [ -x "$(command -v R)" ]; then
-  status "Checking host R environment..."
-  # Check if DESeq2 is installed
-  if Rscript -e "library(DESeq2); print(TRUE)" 2>&1 | tail -n 1 | grep -q TRUE; then
-    # Get installed Bioconductor version
-    host_bioc_vers=$(Rscript -e "library(BiocManager); BiocManager::version()" 2>&1 | tail -n 1 | grep -Eo '[0-9]+\.[0-9]+')
-    # Check to see if host_bioc_version is in our tested range
-    if [[ $host_bioc_vers =~ $tested_bioc_versions ]]; then
-      success "DESeq2 is already installed in the host environment"
-      install_R_deseq2=false
-    else
-      echo
-      echo "tinyRNA has been tested with DESeq2 in Bioconductor release $tested_bioc_versions." \
-      "The installer found v$host_bioc_vers on your system. tinyRNA can use your copy or we can" \
-      "install a tested version of DESeq2 and R in the isolated tinyRNA environment." | fold -s
-      echo
-      echo "BEWARE: installation of DESeq2 will take over 20 minutes."
-      echo
-      read -p "Would you like tinyRNA to use your copy of DESeq2? [y/n]: " -n 1 -r
-      echo
-      if [[ $REPLY =~ ^[Yy]$ ]]; then
-        success "The host's DESeq2 installation will be used"
-        install_R_deseq2=false
-      elif [[ $REPLY =~ ^[^YyNn]$ ]]; then
-        fail "Invalid option: $REPLY"
-        exit 1
-      fi
-    fi # End of Bioconductor version check
-  fi # End DESeq2 check
-fi # End of R check
-
-if [[ $install_R_deseq2 == false ]]; then
-  # Switch to using non-R lock file
-  platform_lock_file="${platform_lock_file//-r/}"
-fi
-
 # Check if the conda environment $env_name exists
 if conda env list | grep -q "$env_name"; then
   echo
@@ -200,34 +158,18 @@ else
   setup_environment
 fi
 
-# Activate tinyRNA environment
+# Activate the environment and set PYTHONNOUSERSITE=1 in its config vars (for stability on Linux)
 conda activate $env_name
+conda env config vars set PYTHONNOUSERSITE=1 > /dev/null  # FYI: cannot be set by lockfile
 
-# Install pip dependencies and our codebase
-status "Installing pip dependencies..."
-if ! pip --use-feature=in-tree-build install htseq==0.13.5 . > "pip_install.log" 2>&1; then
-  fail "Failed to install pip dependencies"
+# Install the tinyRNA codebase
+status "Installing tinyRNA codebase via pip..."
+if ! pip install . > "pip_install.log" 2>&1; then
+  fail "Failed to install tinyRNA codebase"
   echo "Check the pip_install.log file for more information."
   exit 1
 fi
-success "pip dependencies installed"
-
-if [[ $install_R_deseq2 == true ]]; then
-  # Install DESeq2 from Bioconductor
-  status "Installing DESeq2 from Bioconductor (this may take over 20 minutes)..."
-  status 'To check status run "tail -f deseq2_install.log" from another terminal'
-  Rscript -e "install.packages(\"BiocManager\", version=\"$bioc_version\", repos=\"https://cloud.r-project.org\")" > "deseq2_install.log" 2>&1
-  Rscript -e "BiocManager::install(\"DESeq2\", version=\"$bioc_version\")" >> "deseq2_install.log" 2>&1
-
-  # Check if DESeq2 installation was successful
-  if grep -q "DONE (DESeq2)" "deseq2_install.log"; then
-    success "DESeq2 installation was successful"
-  else
-    fail "DESeq2 installation failed"
-    echo "See deseq2_install.log for more information"
-    exit 1
-  fi
-fi
+success "tinyRNA codebase installed"
 
 success "Setup complete"
 status "To activate the environment, run:"

diff --git a/tiny/cwl/tools/bowtie.cwl b/tiny/cwl/tools/bowtie.cwl
@@ -17,6 +17,7 @@ inputs:
   ebwt:
     type: string
     inputBinding:
+      prefix: -x
       position: 23
     doc: "The basename of the index to be searched."
 
@@ -28,9 +29,8 @@ inputs:
   reads:
     type: File
     inputBinding:
-      itemSeparator: ","
       position: 24
-    doc: "Comma-separated list of files containing unpaired reads"
+    doc: "File containing unpaired reads"
 
   outfile:
     type: string

diff --git a/tiny/entry.py b/tiny/entry.py
@@ -97,7 +97,10 @@ def run(tinyrna_cwl_path: str, config_file: str) -> None:
 
     if config_object['run_native']:  # experimental
         # Execute the CWL runner via native Python
-        return_code = run_native(config_object, workflow, run_directory, verbosity=loudness)
+        return_code = run_native(
+            config_object, workflow,
+            run_directory=run_directory,
+            parallel=parallel, verbosity=loudness)
     else:
         # Use the cwltool CWL runner via command line
         return_code = run_cwltool_subprocess(
@@ -176,7 +179,7 @@ def run_cwltool_subprocess(config_file: str, workflow: str, run_directory=None,
     """
 
     command = ['cwltool --timestamps --relax-path-checks --on-error continue']
-    if verbosity == 'debug': command.append('--debug --js-console')
+    if verbosity == 'debug': command.append('--debug --js-console --leave-tmpdir')
     if verbosity == 'quiet': command.append('--quiet')
     if run_directory: command.append(f'--outdir {run_directory}')
     if parallel: command.append('--parallel')
@@ -185,7 +188,7 @@ def run_cwltool_subprocess(config_file: str, workflow: str, run_directory=None,
     return subprocess.run(cwl_runner, shell=True).returncode
 
 
-def run_native(config_object: 'ConfigBase', workflow: str, run_directory: str = '.', verbosity="normal") -> int:
+def run_native(config_object: 'ConfigBase', workflow: str, run_directory: str = '.', parallel=False, verbosity="normal") -> int:
     """Executes the workflow using native Python rather than subprocess "command line"
 
     Args:
@@ -238,14 +241,14 @@ def furnish_if_file_record(file_dict):
                         datefmt="%Y-%m-%d %H:%M:%S", level=level, isatty=True)
 
     # Create a wrapper for the executors so that we may pass our logger to them (unsupported by Factory)
-    parallel: MultithreadedJobExecutor = functools.partial(MultithreadedJobExecutor(), logger=logger)
-    serial: SingleJobExecutor = functools.partial(SingleJobExecutor(), logger=logger)
+    parallel_exec: MultithreadedJobExecutor = functools.partial(MultithreadedJobExecutor(), logger=logger)
+    serial_exec: SingleJobExecutor = functools.partial(SingleJobExecutor(), logger=logger)
 
     # Instantiate Factory with our run preferences
     cwl = cwltool.factory.Factory(
         runtime_context=runtime_context,
         loading_context=LoadingContext({'relax_path_checks': True}),
-        executor=parallel if parallel else serial
+        executor=parallel_exec if parallel else serial_exec
     )
 
     try:

diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py
@@ -163,6 +163,7 @@ def map_and_reduce(libraries, prefs):
 
     # Use a multiprocessing pool if multiple sam files were provided
     if len(libraries) > 1:
+        mp.set_start_method("fork")
         with mp.Pool(len(libraries)) as pool:
             async_results = pool.imap_unordered(counter.count_reads, libraries)
 

diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py
@@ -13,20 +13,17 @@
 feature_record_tuple = Tuple[str, str, Tuple[match_tuple]]  # (feature ID, strand, match tuple)
 
 
-class Features:
+class Features(metaclass=Singleton):
     chrom_vectors: HTSeq.ChromVector
-    classes: dict
     aliases: dict
+    classes: dict
+    tags: dict
 
-    _instance = None  # Singleton
-
-    def __init__(self, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict, tags: dict):
-        if Features._instance is None:
-            Features.chrom_vectors = features.chrom_vectors  # For interval -> feature record tuple lookups
-            Features.aliases = aliases                       # For feature ID -> preferred feature name lookups
-            Features.classes = classes                       # For feature ID -> class lookups
-            Features.tags = tags                             # For feature ID -> match IDs
-            Features._instance = self
+    def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict, tags: dict):
+        Features.chrom_vectors = features.chrom_vectors  # For interval -> feature record tuple lookups
+        Features.aliases = aliases                       # For feature ID -> preferred feature name lookups
+        Features.classes = classes                       # For feature ID -> class lookups
+        Features.tags = tags                             # For feature ID -> match IDs
 
 
 class FeatureCounter:

diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py
@@ -172,7 +172,7 @@ def infer_strandedness(sam_file: str, intervals: dict) -> str:
     else: return "non-reverse"
 
 
-def parse_GFF_attribute_string(attrStr, extra_return_first_value=False):
+def parse_GFF_attribute_string(attrStr, extra_return_first_value=False, gff_version=2):
     """Parses a GFF attribute string and returns it as a dictionary.
 
     This is a slight modification of the same method found in HTSeq.features.
@@ -186,25 +186,35 @@ def parse_GFF_attribute_string(attrStr, extra_return_first_value=False):
         ID."
     """
 
+    if attrStr.endswith("\n"):
+        attrStr = attrStr[:-1]
+
     # Modification: store attributes in a dict subclass that allows case-insensitive ops
     attribute_dict = CaseInsensitiveAttrs()
     first_val = "_unnamed_"
-    for i, attr in enumerate(HTSeq._HTSeq.quotesafe_split(attrStr.rstrip().encode())):
+
+    if gff_version == 2:
+        iterator = HTSeq._HTSeq.quotesafe_split(attrStr.encode())
+    else:
+        # GFF3 gives quotes no special meaning, so a plain split on ';' is used
+        iterator = attrStr.encode().split(b';')
+
+    for i, attr in enumerate(iterator):
         attr = attr.decode()
         if _re_attr_empty.match(attr):
             continue
-        if attr.count('"') not in (0, 2):
+        if (gff_version == 2) and attr.count('"') not in (0, 2):
             raise ValueError(
                 "The attribute string seems to contain mismatched  quotes.")
         mo = _re_attr_main.match(attr)
         if not mo:
             raise ValueError("Failure parsing GFF attribute line")
         key = mo.group(1)
         val = mo.group(2)
-        if val.startswith('"') and val.endswith('"'):
+        if (gff_version == 2) and val.startswith('"') and val.endswith('"'):
             val = val[1:-1]
         # Modification: allow for comma separated attribute values
-        attribute_dict[sys.intern(key)] = (sys.intern(val),) \
+        attribute_dict[key] = (val,) \
             if ',' not in val \
             else tuple(c.strip() for c in val.split(','))
         if extra_return_first_value and i == 0:
@@ -350,7 +360,7 @@ def __init__(self, gff_files: Dict[str, list], feature_selector, **kwargs):
         self.tags = defaultdict(set)                                    # Root Feature ID -> Root Match ID
 
         # Patch the GFF attribute parser to support comma separated attribute value lists
-        setattr(HTSeq.features, 'parse_GFF_attribute_string', parse_GFF_attribute_string)
+        setattr(HTSeq.features.GFF_Reader, 'parse_GFF_attribute_string', staticmethod(parse_GFF_attribute_string))
 
     @report_execution_time("GFF parsing")
     def get(self) -> Tuple[StepVector, AliasTable, ClassTable, dict]:

diff --git a/tiny/rna/counter/matching.py b/tiny/rna/counter/matching.py
@@ -184,7 +184,7 @@ def __contains__(self, alignment: dict):
             terminus of this feature's interval.
         """
 
-        if alignment['strand'] is '+':
+        if alignment['strand'] == '+':
             return alignment['start'] == self.start
         else:
             return alignment['end'] == self.end
@@ -221,7 +221,7 @@ def __contains__(self, alignment):
             terminus of this feature's interval.
         """
 
-        if alignment["strand"] is '+':
+        if alignment["strand"] == '+':
             return alignment['end'] == self.end
         else:
             return alignment['start'] == self.start

diff --git a/tiny/rna/plotter.py b/tiny/rna/plotter.py
@@ -248,9 +248,10 @@ def get_sample_rep_dict(df: pd.DataFrame) -> dict:
     """
 
     sample_dict = defaultdict(list)
+    non_numeric_cols = ["Feature Class", "Feature Name"]
 
     for col in df.columns:
-        if col == "Feature Class": continue
+        if col in non_numeric_cols: continue
         sample = col.split("_rep_")[0]
         sample_dict[sample].append(col)
 
@@ -668,6 +669,7 @@ def main():
         itinerary.append((func, arg, kwd))
 
     if len(itinerary) > 1 and not aqplt.is_debug_mode():
+        mp.set_start_method('fork')
         with mp.Pool(len(itinerary)) as pool:
             results = []
             for task, args, kwds in itinerary:

diff --git a/tiny/rna/plotterlib.py b/tiny/rna/plotterlib.py
@@ -272,7 +272,7 @@ def get_scatter_view_lims(counts_df: pd.DataFrame) -> Tuple[float, float]:
         """Calculates scatter view limits for the counts dataframe"""
 
         x0 = counts_df.min(axis='columns').where(lambda x: x != 0).dropna().min()
-        x1 = np.max(counts_df).max()
+        x1 = counts_df.max().max()
         minpos = 1e-300
 
         if not np.isfinite([x0, x1]).all() or not isinstance(x0, np.float) or x1 <= 0:
@@ -382,7 +382,7 @@ def every_nth_label(self, axis: mpl.axis.Axis, n: int) -> Tuple[List[mpl.axis.Ti
 
         # If the last tick label on the x-axis will extend past the plot space,
         # then hide it and its corresponding tick on the y-axis
-        if axis.__name__ is "xaxis" and axis.get_tick_space() == len(ticks_displayed):
+        if axis.__name__ == "xaxis" and axis.get_tick_space() == len(ticks_displayed):
             major_ticks[last_idx].label1.set_visible(False)
             yaxis = axis.axes.yaxis
             yaxis.get_major_ticks()[last_idx].label1.set_visible(False)
@@ -421,7 +421,7 @@ def cache_ticks(self, axis: mpl.axis.Axis, name: str):
     def restore_ticks(self, ax: plt.Axes, axis: str):
         """Restore tick objects from previous render"""
 
-        axes = [ax.xaxis, ax.yaxis] if axis is "both" else [getattr(ax, axis)]
+        axes = [ax.xaxis, ax.yaxis] if axis == "both" else [getattr(ax, axis)]
         for axis in axes:
             name = axis.__name__
             for type in ["major", "minor"]: