Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
9475779
Dependencies and Python version have been upgraded for 3.9. This repr…
AlexTate Jul 20, 2022
04d451a
Update multiprocessing utilities to use fork instead of spawn. MP for…
AlexTate Jul 20, 2022
8605d55
Updated our parse_GFF_attribute_string() patch to reflect latest chan…
AlexTate Jul 20, 2022
1cf7b48
Python now issues a warning when comparing to string literals using t…
AlexTate Jul 20, 2022
e6cb8de
Minor change to fix a future deprecation warning from numpy
AlexTate Jul 20, 2022
0fd4f81
Minor change to make the cwltool runner more useful when running the …
AlexTate Jul 20, 2022
2f70e27
The Features class now uses the improved singleton design pattern fir…
AlexTate Jul 20, 2022
7687071
Settling on a version compromise for r-base and nodejs (newer version…
AlexTate Jul 20, 2022
d022c6b
setup.sh has been updated to no longer install r or DESeq2 since this…
AlexTate Jul 20, 2022
befadb5
Removing old lockfiles for the r-included installs. Remaining lockfil…
AlexTate Jul 20, 2022
4d5b962
Finalizing conda lock files. environment.yml has also been updated to…
AlexTate Jul 26, 2022
ef7521a
Small bugfix to address a non-numeric reduction warning
AlexTate Jul 26, 2022
b3961e6
Package version corrections in setup.py
AlexTate Jul 26, 2022
855daec
Directing stdout from `conda env config vars` to /dev/null to keep te…
AlexTate Jul 26, 2022
1112785
Updating project description in setup.py
AlexTate Jul 26, 2022
6da1e4f
Version bugfix for the bowtie CWL wrapper. The previous method of set…
AlexTate Jul 27, 2022
9073081
The run_native runner mode was performing parallel runs regardless of…
AlexTate Jul 27, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
405 changes: 271 additions & 134 deletions conda/conda-linux-64.lock

Large diffs are not rendered by default.

383 changes: 261 additions & 122 deletions conda/conda-osx-64.lock

Large diffs are not rendered by default.

219 changes: 0 additions & 219 deletions conda/conda-r-linux-64.lock

This file was deleted.

191 changes: 0 additions & 191 deletions conda/conda-r-osx-64.lock

This file was deleted.

31 changes: 16 additions & 15 deletions conda/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,22 @@ channels:
- bioconda
- main
dependencies:
- python>=3.7
- numpy==1.19.2
- pandas==1.2.4
- conda-forge::matplotlib==3.4.3
- fastp==0.22.0
- bowtie==1.2.3
- tbb==2020.3 # Bowtie dependency, temporary fix. v2021.x breaks bowtie, is not properly pinned to v2020 in bioconda
- nodejs==10.13.0
- conda-forge::cwltool==3.1.20211001174446
- conda-forge::psutil==5.8.0
- python>=3.9
- bioconductor-deseq2==1.34.0
- bowtie==1.3.1
- fastp==0.23.2
- htseq==2.0.2
- nodejs==16.13.1
- numpy==1.23.1
- pandas==1.4.3
- pip==22.1.2
- setuptools==63.1.0
- conda-forge::cwltool==3.1.20220628170238
- conda-forge::matplotlib==3.5.2
- conda-forge::mscorefonts==0.0.1
- conda-forge::psutil==5.9.1
- conda-forge::r-base==4.1.1
- setuptools==52.0.0
- pip==21.2.4
- pip:
- --use-feature=in-tree-build
- htseq==0.13.5
- ../ # Install tinyRNA via setup.py
- ../ # Install tinyRNA via setup.py
variables:
- PYTHONNOUSERSITE: 1
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@

# Package metadata
NAME = 'tinyrna'
DESCRIPTION = 'Comprehensive analysis of small RNA high-throughput sequencing data'
DESCRIPTION = 'Precision analysis of small RNA high-throughput sequencing data'
URL = 'https://github.com/MontgomeryLab/tinyrna/'
EMAIL = 'ajtate@colostate.edu'
AUTHOR = 'Kristen Brown, Alex Tate'
PLATFORM = 'Unix'
REQUIRES_PYTHON = '>=3.7.0'
VERSION = '0.1'
REQUIRES_PYTHON = '>=3.9.0'
VERSION = '1.0'

# Required packages are installed via Conda's environment.yml
# See PreFlight below...
Expand Down
78 changes: 10 additions & 68 deletions setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@

[[ $# -eq 1 ]] && env_name=$1 || env_name="tinyrna"

python_version="3.7"
python_version="3.9"
miniconda_version="4.12.0"
bioc_version="3.14"
tested_bioc_versions="3.1[2-4]"

function success() {
check="✓"
Expand Down Expand Up @@ -83,12 +81,12 @@ if [[ "$OSTYPE" == "darwin"* ]]; then
success "macOS detected"
shell=$(basename "$(dscl . -read ~/ UserShell | cut -f 2 -d " ")")
miniconda_installer="Miniconda3-py${python_version/./}_${miniconda_version}-MacOSX-x86_64.sh"
platform_lock_file="./conda/conda-r-osx-64.lock"
platform_lock_file="./conda/conda-osx-64.lock"
elif [[ "$OSTYPE" == "linux-gnu" ]]; then
success "Linux detected"
shell="$(basename "$SHELL")"
miniconda_installer="Miniconda3-py${python_version/./}_${miniconda_version}-Linux-x86_64.sh"
platform_lock_file="./conda/conda-r-linux-64.lock"
platform_lock_file="./conda/conda-linux-64.lock"
else
fail "Unsupported OS"
exit 1
Expand Down Expand Up @@ -123,46 +121,6 @@ else
rm $miniconda_installer
fi

# By default, assume that host environment does not contain an appropriate DESeq2 version
install_R_deseq2=true

# Check if R is installed
if [ -x "$(command -v R)" ]; then
status "Checking host R environment..."
# Check if DESeq2 is installed
if Rscript -e "library(DESeq2); print(TRUE)" 2>&1 | tail -n 1 | grep -q TRUE; then
# Get installed Bioconductor version
host_bioc_vers=$(Rscript -e "library(BiocManager); BiocManager::version()" 2>&1 | tail -n 1 | grep -Eo '[0-9]+\.[0-9]+')
# Check to see if host_bioc_version is in our tested range
if [[ $host_bioc_vers =~ $tested_bioc_versions ]]; then
success "DESeq2 is already installed in the host environment"
install_R_deseq2=false
else
echo
echo "tinyRNA has been tested with DESeq2 in Bioconductor release $tested_bioc_versions." \
"The installer found v$host_bioc_vers on your system. tinyRNA can use your copy or we can" \
"install a tested version of DESeq2 and R in the isolated tinyRNA environment." | fold -s
echo
echo "BEWARE: installation of DESeq2 will take over 20 minutes."
echo
read -p "Would you like tinyRNA to use your copy of DESeq2? [y/n]: " -n 1 -r
echo
if [[ $REPLY =~ ^[Yy]$ ]]; then
success "The host's DESeq2 installation will be used"
install_R_deseq2=false
elif [[ $REPLY =~ ^[^YyNn]$ ]]; then
fail "Invalid option: $REPLY"
exit 1
fi
fi # End of Bioconductor version check
fi # End DESeq2 check
fi # End of R check

if [[ $install_R_deseq2 == false ]]; then
# Switch to using non-R lock file
platform_lock_file="${platform_lock_file//-r/}"
fi

# Check if the conda environment $env_name exists
if conda env list | grep -q "$env_name"; then
echo
Expand Down Expand Up @@ -200,34 +158,18 @@ else
setup_environment
fi

# Activate tinyRNA environment
# Activate environment and set environment variable config for Linux stability
conda activate $env_name
conda env config vars set PYTHONNOUSERSITE=1 > /dev/null # FYI: cannot be set by lockfile

# Install pip dependencies and our codebase
status "Installing pip dependencies..."
if ! pip --use-feature=in-tree-build install htseq==0.13.5 . > "pip_install.log" 2>&1; then
fail "Failed to install pip dependencies"
# Install the tinyRNA codebase
status "Installing tinyRNA codebase via pip..."
if ! pip install . > "pip_install.log" 2>&1; then
fail "Failed to install tinyRNA codebase"
echo "Check the pip_install.log file for more information."
exit 1
fi
success "pip dependencies installed"

if [[ $install_R_deseq2 == true ]]; then
# Install DESeq2 from Bioconductor
status "Installing DESeq2 from Bioconductor (this may take over 20 minutes)..."
status 'To check status run "tail -f deseq2_install.log" from another terminal'
Rscript -e "install.packages(\"BiocManager\", version=\"$bioc_version\", repos=\"https://cloud.r-project.org\")" > "deseq2_install.log" 2>&1
Rscript -e "BiocManager::install(\"DESeq2\", version=\"$bioc_version\")" >> "deseq2_install.log" 2>&1

# Check if DESeq2 installation was successful
if grep -q "DONE (DESeq2)" "deseq2_install.log"; then
success "DESeq2 installation was successful"
else
fail "DESeq2 installation failed"
echo "See deseq2_install.log for more information"
exit 1
fi
fi
success "tinyRNA codebase installed"

success "Setup complete"
status "To activate the environment, run:"
Expand Down
4 changes: 2 additions & 2 deletions tiny/cwl/tools/bowtie.cwl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ inputs:
ebwt:
type: string
inputBinding:
prefix: -x
position: 23
doc: "The basename of the index to be searched."

Expand All @@ -28,9 +29,8 @@ inputs:
reads:
type: File
inputBinding:
itemSeparator: ","
position: 24
doc: "Comma-separated list of files containing unpaired reads"
doc: "File containing unpaired reads"

outfile:
type: string
Expand Down
15 changes: 9 additions & 6 deletions tiny/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,10 @@ def run(tinyrna_cwl_path: str, config_file: str) -> None:

if config_object['run_native']: # experimental
# Execute the CWL runner via native Python
return_code = run_native(config_object, workflow, run_directory, verbosity=loudness)
return_code = run_native(
config_object, workflow,
run_directory=run_directory,
parallel=parallel, verbosity=loudness)
else:
# Use the cwltool CWL runner via command line
return_code = run_cwltool_subprocess(
Expand Down Expand Up @@ -176,7 +179,7 @@ def run_cwltool_subprocess(config_file: str, workflow: str, run_directory=None,
"""

command = ['cwltool --timestamps --relax-path-checks --on-error continue']
if verbosity == 'debug': command.append('--debug --js-console')
if verbosity == 'debug': command.append('--debug --js-console --leave-tmpdir')
if verbosity == 'quiet': command.append('--quiet')
if run_directory: command.append(f'--outdir {run_directory}')
if parallel: command.append('--parallel')
Expand All @@ -185,7 +188,7 @@ def run_cwltool_subprocess(config_file: str, workflow: str, run_directory=None,
return subprocess.run(cwl_runner, shell=True).returncode


def run_native(config_object: 'ConfigBase', workflow: str, run_directory: str = '.', verbosity="normal") -> int:
def run_native(config_object: 'ConfigBase', workflow: str, run_directory: str = '.', parallel=False, verbosity="normal") -> int:
"""Executes the workflow using native Python rather than subprocess "command line"

Args:
Expand Down Expand Up @@ -238,14 +241,14 @@ def furnish_if_file_record(file_dict):
datefmt="%Y-%m-%d %H:%M:%S", level=level, isatty=True)

# Create a wrapper for the executors so that we may pass our logger to them (unsupported by Factory)
parallel: MultithreadedJobExecutor = functools.partial(MultithreadedJobExecutor(), logger=logger)
serial: SingleJobExecutor = functools.partial(SingleJobExecutor(), logger=logger)
parallel_exec: MultithreadedJobExecutor = functools.partial(MultithreadedJobExecutor(), logger=logger)
serial_exec: SingleJobExecutor = functools.partial(SingleJobExecutor(), logger=logger)

# Instantiate Factory with our run preferences
cwl = cwltool.factory.Factory(
runtime_context=runtime_context,
loading_context=LoadingContext({'relax_path_checks': True}),
executor=parallel if parallel else serial
executor=parallel_exec if parallel else serial_exec
)

try:
Expand Down
1 change: 1 addition & 0 deletions tiny/rna/counter/counter.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ def map_and_reduce(libraries, prefs):

# Use a multiprocessing pool if multiple sam files were provided
if len(libraries) > 1:
mp.set_start_method("fork")
with mp.Pool(len(libraries)) as pool:
async_results = pool.imap_unordered(counter.count_reads, libraries)

Expand Down
19 changes: 8 additions & 11 deletions tiny/rna/counter/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,17 @@
feature_record_tuple = Tuple[str, str, Tuple[match_tuple]] # (feature ID, strand, match tuple)


class Features:
class Features(metaclass=Singleton):
chrom_vectors: HTSeq.ChromVector
classes: dict
aliases: dict
classes: dict
tags: dict

_instance = None # Singleton

def __init__(self, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict, tags: dict):
if Features._instance is None:
Features.chrom_vectors = features.chrom_vectors # For interval -> feature record tuple lookups
Features.aliases = aliases # For feature ID -> preferred feature name lookups
Features.classes = classes # For feature ID -> class lookups
Features.tags = tags # For feature ID -> match IDs
Features._instance = self
def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict, tags: dict):
Features.chrom_vectors = features.chrom_vectors # For interval -> feature record tuple lookups
Features.aliases = aliases # For feature ID -> preferred feature name lookups
Features.classes = classes # For feature ID -> class lookups
Features.tags = tags # For feature ID -> match IDs


class FeatureCounter:
Expand Down
22 changes: 16 additions & 6 deletions tiny/rna/counter/hts_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def infer_strandedness(sam_file: str, intervals: dict) -> str:
else: return "non-reverse"


def parse_GFF_attribute_string(attrStr, extra_return_first_value=False):
def parse_GFF_attribute_string(attrStr, extra_return_first_value=False, gff_version=2):
"""Parses a GFF attribute string and returns it as a dictionary.

This is a slight modification of the same method found in HTSeq.features.
Expand All @@ -186,25 +186,35 @@ def parse_GFF_attribute_string(attrStr, extra_return_first_value=False):
ID."
"""

if attrStr.endswith("\n"):
attrStr = attrStr[:-1]

# Modification: store attributes in a dict subclass that allows case-insensitive ops
attribute_dict = CaseInsensitiveAttrs()
first_val = "_unnamed_"
for i, attr in enumerate(HTSeq._HTSeq.quotesafe_split(attrStr.rstrip().encode())):

if gff_version == 2:
iterator = HTSeq._HTSeq.quotesafe_split(attrStr.encode())
else:
# GFF3 does not care about quotes
iterator = attrStr.encode().split(b';')

for i, attr in enumerate(iterator):
attr = attr.decode()
if _re_attr_empty.match(attr):
continue
if attr.count('"') not in (0, 2):
if (gff_version == 2) and attr.count('"') not in (0, 2):
raise ValueError(
"The attribute string seems to contain mismatched quotes.")
mo = _re_attr_main.match(attr)
if not mo:
raise ValueError("Failure parsing GFF attribute line")
key = mo.group(1)
val = mo.group(2)
if val.startswith('"') and val.endswith('"'):
if (gff_version == 2) and val.startswith('"') and val.endswith('"'):
val = val[1:-1]
# Modification: allow for comma separated attribute values
attribute_dict[sys.intern(key)] = (sys.intern(val),) \
attribute_dict[key] = (val,) \
if ',' not in val \
else tuple(c.strip() for c in val.split(','))
if extra_return_first_value and i == 0:
Expand Down Expand Up @@ -350,7 +360,7 @@ def __init__(self, gff_files: Dict[str, list], feature_selector, **kwargs):
self.tags = defaultdict(set) # Root Feature ID -> Root Match ID

# Patch the GFF attribute parser to support comma separated attribute value lists
setattr(HTSeq.features, 'parse_GFF_attribute_string', parse_GFF_attribute_string)
setattr(HTSeq.features.GFF_Reader, 'parse_GFF_attribute_string', staticmethod(parse_GFF_attribute_string))

@report_execution_time("GFF parsing")
def get(self) -> Tuple[StepVector, AliasTable, ClassTable, dict]:
Expand Down
4 changes: 2 additions & 2 deletions tiny/rna/counter/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def __contains__(self, alignment: dict):
terminus of this feature's interval.
"""

if alignment['strand'] is '+':
if alignment['strand'] == '+':
return alignment['start'] == self.start
else:
return alignment['end'] == self.end
Expand Down Expand Up @@ -221,7 +221,7 @@ def __contains__(self, alignment):
terminus of this feature's interval.
"""

if alignment["strand"] is '+':
if alignment["strand"] == '+':
return alignment['end'] == self.end
else:
return alignment['start'] == self.start
Expand Down
4 changes: 3 additions & 1 deletion tiny/rna/plotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,9 +248,10 @@ def get_sample_rep_dict(df: pd.DataFrame) -> dict:
"""

sample_dict = defaultdict(list)
non_numeric_cols = ["Feature Class", "Feature Name"]

for col in df.columns:
if col == "Feature Class": continue
if col in non_numeric_cols: continue
sample = col.split("_rep_")[0]
sample_dict[sample].append(col)

Expand Down Expand Up @@ -668,6 +669,7 @@ def main():
itinerary.append((func, arg, kwd))

if len(itinerary) > 1 and not aqplt.is_debug_mode():
mp.set_start_method('fork')
with mp.Pool(len(itinerary)) as pool:
results = []
for task, args, kwds in itinerary:
Expand Down
6 changes: 3 additions & 3 deletions tiny/rna/plotterlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def get_scatter_view_lims(counts_df: pd.DataFrame) -> Tuple[float, float]:
"""Calculates scatter view limits for the counts dataframe"""

x0 = counts_df.min(axis='columns').where(lambda x: x != 0).dropna().min()
x1 = np.max(counts_df).max()
x1 = counts_df.max().max()
minpos = 1e-300

if not np.isfinite([x0, x1]).all() or not isinstance(x0, np.float) or x1 <= 0:
Expand Down Expand Up @@ -382,7 +382,7 @@ def every_nth_label(self, axis: mpl.axis.Axis, n: int) -> Tuple[List[mpl.axis.Ti

# If the last tick label on the x-axis will extend past the plot space,
# then hide it and its corresponding tick on the y-axis
if axis.__name__ is "xaxis" and axis.get_tick_space() == len(ticks_displayed):
if axis.__name__ == "xaxis" and axis.get_tick_space() == len(ticks_displayed):
major_ticks[last_idx].label1.set_visible(False)
yaxis = axis.axes.yaxis
yaxis.get_major_ticks()[last_idx].label1.set_visible(False)
Expand Down Expand Up @@ -421,7 +421,7 @@ def cache_ticks(self, axis: mpl.axis.Axis, name: str):
def restore_ticks(self, ax: plt.Axes, axis: str):
"""Restore tick objects from previous render"""

axes = [ax.xaxis, ax.yaxis] if axis is "both" else [getattr(ax, axis)]
axes = [ax.xaxis, ax.yaxis] if axis == "both" else [getattr(ax, axis)]
for axis in axes:
name = axis.__name__
for type in ["major", "minor"]:
Expand Down