Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions tests/unit_tests_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from tiny.rna.configuration import Configuration, SamplesSheet, PathsFile
from unit_test_helpers import csv_factory, paths_template_file, make_paths_file
from tiny.rna.util import r_reserved_keywords


class BowtieIndexesTest(unittest.TestCase):
Expand Down Expand Up @@ -209,6 +210,22 @@ def test_validate_fastq_filepath(self):
patch('tiny.rna.configuration.os.path.isfile', return_value=True):
SamplesSheet('mock_filename')

"""Does validate_r_safe_sample_groups detect group names that will cause namespace collisions in R?"""

def test_validate_r_safe_sample_groups(self):
non_alphanum_chars = [bad.join(('a', 'b')) for bad in "~!@#$%^&*()+-=`<>?/,:;\"'[]{}\| \t\n\r\f\v"]
leading_dot_number = [".0", "X.0"]

for bad in [non_alphanum_chars, leading_dot_number]:
msg = " ≈ ".join(bad)
with self.assertRaisesRegex(AssertionError, msg):
SamplesSheet.validate_r_safe_sample_groups(dict.fromkeys(bad))

for kwd in r_reserved_keywords:
bad = (kwd, kwd + '.')
msg = " ≈ ".join(bad)
with self.assertRaisesRegex(AssertionError, msg):
SamplesSheet.validate_r_safe_sample_groups(dict.fromkeys(bad))

class PathsFileTest(unittest.TestCase):

Expand Down
21 changes: 19 additions & 2 deletions tiny/rna/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
import re

from pkg_resources import resource_filename
from collections import Counter, OrderedDict
from collections import Counter, OrderedDict, defaultdict
from typing import Union, Any, Optional, List
from glob import glob

from tiny.rna.counter.validation import GFFValidator
from tiny.rna.util import get_timestamp
from tiny.rna.util import get_timestamp, get_r_safename


class ConfigBase:
Expand Down Expand Up @@ -586,6 +586,8 @@ def validate_control_group(self, is_control: bool, group: str):

@staticmethod
def validate_deseq_compatibility(sample_groups: Counter) -> bool:
SamplesSheet.validate_r_safe_sample_groups(sample_groups)

total_samples = sum(sample_groups.values())
total_coefficients = len(sample_groups)
degrees_of_freedom = total_samples - total_coefficients
Expand All @@ -598,6 +600,21 @@ def validate_deseq_compatibility(sample_groups: Counter) -> bool:
else:
return True

@staticmethod
def validate_r_safe_sample_groups(sample_groups: Counter):
"""Determine the "syntactically valid" translation of each group name to ensure
that two groups won't share the same name once translated in tiny-deseq.r"""

safe_names = defaultdict(list)
for group in sample_groups:
safe_names[get_r_safename(group)].append(group)

collisions = [' ≈ '.join(cluster) for cluster in safe_names.values() if len(cluster) > 1]

assert len(collisions) == 0, \
"The following group names are too similar and will cause a namespace collision in R:\n" \
+ '\n'.join(collisions)

@staticmethod
def get_sample_basename(filename):
root, _ = os.path.splitext(filename)
Expand Down
24 changes: 20 additions & 4 deletions tiny/rna/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,17 +95,33 @@ def make_filename(args, ext='.csv'):
return '_'.join([str(chnk) for chnk in args if chnk is not None]) + ext


r_reserved_keywords = [
"if", "else", "repeat", "while", "function",
"for", "in", "next", "break", "TRUE", "FALSE",
"NULL", "Inf", "NaN", "NA", "NA_integer_",
"NA_real_", "NA_complex_", "NA_character_"]


def get_r_safename(name: str) -> str:
"""Converts a string to a syntactically valid R name

This can be used to match names along axes of DataFrames produced by R,
assuming that the R script takes no measures to preserve names itself.
This can be used as the Python equivalent of R's make.names() function.
https://stat.ethz.ch/R-manual/R-devel/library/base/html/make.names.html
"""

# If the name starts with a non-letter character or a dot
# followed by a number, the character "X" is prepended
leading_char = lambda x: re.sub(r"^(?=[^a-zA-Z.]+|\.\d)", "X", x)
special_char = lambda x: re.sub(r"[^a-zA-Z0-9_.]", ".", x)
return special_char(leading_char(name))

# If the name contains characters that aren't (locale based) letters,
# numbers, dot, or underscore, the characters are replaced with a dot
special_char = lambda x: re.sub(r"[^\w.]", ".", x)

# If the name contains R keywords, a dot is appended to the keyword
reserved = "|".join(r_reserved_keywords)
reserved_wrd = lambda x: re.sub(fr"^({reserved})$", r'\1.', x)

return reserved_wrd(special_char(leading_char(name)))


class ReadOnlyDict(dict):
Expand Down