diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 06dabff..65cca76 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -11,6 +11,11 @@ Change Log .. There should always be an "Unreleased" section for changes pending release. +[0.6.0] - 2020-08-27 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* Add support for multiline annotations for lines prefixed with single-line comment signs ("#") + [0.5.1] - 2020-08-25 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/code_annotations/__init__.py b/code_annotations/__init__.py index cfbe56a..8538025 100644 --- a/code_annotations/__init__.py +++ b/code_annotations/__init__.py @@ -2,4 +2,4 @@ Extensible tools for parsing annotations in codebases. """ -__version__ = '0.5.1' +__version__ = '0.6.0' diff --git a/code_annotations/extensions/base.py b/code_annotations/extensions/base.py index 69fe36d..9a45281 100644 --- a/code_annotations/extensions/base.py +++ b/code_annotations/extensions/base.py @@ -42,22 +42,27 @@ class SimpleRegexAnnotationExtension(AnnotationExtension, metaclass=ABCMeta): # Javascript and Python extensions for examples. lang_comment_definition = None - r""" - This format string/regex finds all comments in the file. The format tokens will be replaced with the - language-specific comment definitions defined in the sub-classes. - - {multi_start} - start of the language-specific multi-line comment (ex. /*) - ([\d\D]*?) - capture all of the characters... - {multi_end} - until you find the end of the language-specific multi-line comment (ex. */) - | - If you don't find any of those... - {single} - start by finding the single-line comment token (ex. //) - (.*) - and capture all characters until the end of the line - - Returns a 2-tuple of: - - ("Comment text", None) in the case of a multi-line comment OR - - (None, "Comment text") in the case of a single-line comment + # This format string/regex finds all comments in the file. 
The format tokens will be replaced with the + # language-specific comment definitions defined in the sub-classes. + # + # Match groupdict will contain two named subgroups: 'comment' and 'prefixed_comment', of which exactly + # one will be non-None (possibly an empty string). + comment_regex_fmt = r""" + {multi_start} # start of the language-specific multi-line comment (ex. /*) + (?P<comment> # Look for a multiline comment + [\d\D]*? # capture all of the characters... + ) + {multi_end} # until you find the end of the language-specific multi-line comment (ex. */) + | # If you don't find any of those... + (?P<prefixed_comment> # Look for a group of single-line comments + (?: # Non-capture mode + {single} # start by finding the single-line comment token (ex. //) + .* # and capture all characters until the end of the line + \n? # followed by an optional newline + \ * # and some empty space + )* # multiple times + ) """ - comment_regex_fmt = r'{multi_start}([\d\D]*?){multi_end}|{single}(.*)' def __init__(self, config, echo): """ @@ -74,7 +79,12 @@ def __init__(self, config, echo): # pylint: disable=not-a-mapping self.comment_regex = re.compile( - self.comment_regex_fmt.format(**self.lang_comment_definition) + self.comment_regex_fmt.format(**self.lang_comment_definition), + flags=re.VERBOSE + ) + self.prefixed_comment_regex = re.compile( + r"^ *{single}".format(**self.lang_comment_definition), + flags=re.MULTILINE ) # Parent class will allow this class to populate self.strings_to_search via @@ -102,15 +112,15 @@ def search(self, file_handle): if any(anno in txt for anno in self.config.annotation_tokens): fname = clean_abs_path(file_handle.name, self.config.source_path) + # Iterate on all comments: both prefixed- and non-prefixed. for match in self.comment_regex.finditer(txt): - # Should only be one match - comment_content = [item for item in match.groups() if item is not None][0] - for inner_match in self.query.finditer(comment_content): - # Get the line number by counting newlines + 1 (for the first line). 
- # Note that this is the line number of the beginning of the comment, not the - # annotation token itself. - line = txt.count('\n', 0, match.start()) + 1 + # Get the line number by counting newlines + 1 (for the first line). + # Note that this is the line number of the beginning of the comment, not the + # annotation token itself. + line = txt.count('\n', 0, match.start()) + 1 + comment_content = self._find_comment_content(match) + for inner_match in self.query.finditer(comment_content): try: annotation_token = inner_match.group('token') annotation_data = inner_match.group('data') @@ -131,3 +141,27 @@ def search(self, file_handle): }) return found_annotations + + def _find_comment_content(self, match): + """ + Return the comment content as text, with single-line comment tokens stripped. + + Args: + match (re.Match): one of the matches of the self.comment_regex regular expression. + """ + comment_content = match.groupdict()["comment"] + if comment_content is not None: + return comment_content + + # Not a multi-line comment, so 'prefixed_comment' matched: strip its single-line comment tokens + comment_content = match.groupdict()["prefixed_comment"] + return self._strip_single_line_comment_tokens(comment_content) + + def _strip_single_line_comment_tokens(self, content): + """ + Strip the leading single-line comment tokens from a comment text. + + Args: + content (str): token-prefixed multi-line comment string. + """ + return self.prefixed_comment_regex.sub("", content) diff --git a/tests/extensions/python_test_files/multiline_singlelinecomment.pyt b/tests/extensions/python_test_files/multiline_singlelinecomment.pyt new file mode 100644 index 0000000..c59334e --- /dev/null +++ b/tests/extensions/python_test_files/multiline_singlelinecomment.pyt @@ -0,0 +1,7 @@ +# Docstring +#.. pii: A long description that +# spans multiple +# lines +# A comment that is not indented and not part of the above multi-line annotation +#.. 
pii_types: id, name +# Some comment that comes after the multiple-line annotation diff --git a/tests/extensions/test_base_extensions.py b/tests/extensions/test_base_extensions.py index cc38f1c..78ce501 100644 --- a/tests/extensions/test_base_extensions.py +++ b/tests/extensions/test_base_extensions.py @@ -28,3 +28,19 @@ def test_nothing_found(): r = FakeExtension(config, VerboseEcho()) with open('tests/extensions/base_test_files/empty.foo') as f: r.search(f) + + +def test_strip_single_line_comment_tokens(): + config = FakeConfig() + + extension = FakeExtension(config, VerboseEcho()) + text = """baz line1 + baz line2 +bazline3 +baz line4""" + expected_result = """ line1 + line2 +line3 + line4""" + # pylint: disable=protected-access + assert expected_result == extension._strip_single_line_comment_tokens(text) diff --git a/tests/extensions/test_extension_python.py b/tests/extensions/test_extension_python.py index b41413b..fc8f7a2 100644 --- a/tests/extensions/test_extension_python.py +++ b/tests/extensions/test_extension_python.py @@ -76,6 +76,15 @@ def test_grouping_and_choice_failures(test_file, expected_exit_code, expected_me Multi-line and multi-paragraph.""") ] ), + ( + 'multiline_singlelinecomment.pyt', + [ + ('.. pii:', """A long description that + spans multiple + lines"""), + ('.. pii_types:', 'id, name'), + ] + ), ]) def test_multi_line_annotations(test_file, annotations): config = AnnotationConfig('tests/test_configurations/.annotations_test')