diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/tests.yml similarity index 73% rename from .github/workflows/integration_tests.yml rename to .github/workflows/tests.yml index 8e21a9b..6f0dd1b 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/tests.yml @@ -1,4 +1,4 @@ -name: Run Python Basics Integration Tests +name: Run Python Basics Integration and Unit Tests on: pull_request: types: [opened, reopened, synchronize] @@ -16,3 +16,7 @@ jobs: run: | cd starpls/integration_tests bazel test //... + - name: Run cr_checker unit tests + run: | + cd cr_checker/tests + bazel test //... diff --git a/cr_checker/tests/.keep b/cr_checker/tests/.keep deleted file mode 100644 index e69de29..0000000 diff --git a/cr_checker/tests/BUILD b/cr_checker/tests/BUILD new file mode 100644 index 0000000..3da53e8 --- /dev/null +++ b/cr_checker/tests/BUILD @@ -0,0 +1,24 @@ +# ******************************************************************************* +# Copyright (c) 2025 Contributors to the Eclipse Foundation +# +# See the NOTICE file(s) distributed with this work for additional +# information regarding copyright ownership. 
+# +# This program and the accompanying materials are made available under the +# terms of the Apache License Version 2.0 which is available at +# https://www.apache.org/licenses/LICENSE-2.0 +# +# SPDX-License-Identifier: Apache-2.0 +# ******************************************************************************* + +load("@score_tooling//python_basics:defs.bzl", "score_py_pytest") + +score_py_pytest( + name = "shebang_unit_tests", + srcs = [ + "test_shebang_handling.py", + ], + deps = [ + "@score_tooling//cr_checker/tool:cr_checker_lib", + ], +) diff --git a/cr_checker/tests/MODULE.bazel b/cr_checker/tests/MODULE.bazel new file mode 100644 index 0000000..5d3b418 --- /dev/null +++ b/cr_checker/tests/MODULE.bazel @@ -0,0 +1,49 @@ +# ******************************************************************************* +# Copyright (c) 2025 Contributors to the Eclipse Foundation +# +# See the NOTICE file(s) distributed with this work for additional +# information regarding copyright ownership. +# +# This program and the accompanying materials are made available under the +# terms of the Apache License Version 2.0 which is available at +# https://www.apache.org/licenses/LICENSE-2.0 +# +# SPDX-License-Identifier: Apache-2.0 +# ******************************************************************************* +module( + name = "score_cr_checker_tests", + version = "0.1.0", + compatibility_level = 0, +) + +bazel_dep(name = "rules_shell", version = "0.5.0") + +# begin Tests + +# PYTHON +bazel_dep(name = "rules_python", version = "1.4.1") + +PYTHON_VERSION = "3.12" + +python = use_extension("@rules_python//python/extensions:python.bzl", "python") +python.toolchain( + python_version = PYTHON_VERSION, +) +use_repo(python) + +# PIP +pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip") +pip.parse( + hub_name = "pip_deps_test", + python_version = PYTHON_VERSION, + requirements_lock = "//:requirements_lock.txt", +) +use_repo(pip, "pip_deps_test") + +bazel_dep(name = 
"score_tooling", version = "0.0.0") +local_path_override( + module_name = "score_tooling", + path = "../../", +) + +# end Tests diff --git a/cr_checker/tests/requirements_lock.txt b/cr_checker/tests/requirements_lock.txt new file mode 100644 index 0000000..e504a61 --- /dev/null +++ b/cr_checker/tests/requirements_lock.txt @@ -0,0 +1 @@ +bazel-runfiles==1.3.0 \ No newline at end of file diff --git a/cr_checker/tests/test_shebang_handling.py b/cr_checker/tests/test_shebang_handling.py new file mode 100644 index 0000000..cd9471b --- /dev/null +++ b/cr_checker/tests/test_shebang_handling.py @@ -0,0 +1,169 @@ +# ******************************************************************************* +# Copyright (c) 2024 Contributors to the Eclipse Foundation +# +# See the NOTICE file(s) distributed with this work for additional +# information regarding copyright ownership. +# +# This program and the accompanying materials are made available under the +# terms of the Apache License Version 2.0 which is available at +# https://www.apache.org/licenses/LICENSE-2.0 +# +# SPDX-License-Identifier: Apache-2.0 +# ******************************************************************************* +# unit tests for the shebang handling in the cr_checker module +from __future__ import annotations + +import importlib.util +import json +from datetime import datetime +from pathlib import Path + + +# load the cr_checker module +def load_cr_checker_module(): + module_path = Path(__file__).resolve().parents[1] / "tool" / "cr_checker.py" + spec = importlib.util.spec_from_file_location("cr_checker_module", module_path) + if spec is None or spec.loader is None: + raise RuntimeError(f"Failed to load cr_checker module from {module_path}") + + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +# load the license template +def load_py_template() -> str: + cr_checker = load_cr_checker_module() + template_file = Path(__file__).resolve().parents[1] / 
"resources" / "templates.ini" + templates = cr_checker.load_templates(template_file) + return templates["py"] + + +# write the config file here so that the year is always up to date with the year +# written in the mock "script.py" file +def write_config(path: Path, years: list[int]) -> Path: + config_path = path / "config.json" + config_path.write_text(json.dumps({"years": years}), encoding="utf-8") + return config_path + + +# test that offset matches the length of the shebang line including trailing newlines +def test_detect_shebang_offset_counts_trailing_newlines(tmp_path): + cr_checker = load_cr_checker_module() + script = tmp_path / "script.py" + script.write_text( + "#!/usr/bin/env python3\n\nprint('hi')\n", + encoding="utf-8", + ) + + offset = cr_checker.detect_shebang_offset(script, "utf-8") + + assert offset == len("#!/usr/bin/env python3\n\n".encode("utf-8")) + + +# test that process_files function validates a license header after the shebang line +def test_process_files_accepts_header_after_shebang(tmp_path): + cr_checker = load_cr_checker_module() + script = tmp_path / "script.py" + header_template = load_py_template() + current_year = datetime.now().year + header = header_template.format(year=current_year) + script.write_text( + "#!/usr/bin/env python3\n" + header + "print('hi')\n", + encoding="utf-8", + ) + config = write_config(tmp_path, [current_year]) + + results = cr_checker.process_files( + [script], + {"py": header_template}, + False, + config, + use_mmap=False, + encoding="utf-8", + offset=0, + remove_offset=0, + ) + + assert results["no_copyright"] == 0 + + +# test that process_files function fixes a missing license header after the shebang line +def test_process_files_fix_inserts_header_after_shebang(tmp_path): + cr_checker = load_cr_checker_module() + script = tmp_path / "script.py" + script.write_text( + "#!/usr/bin/env python3\nprint('hi')\n", + encoding="utf-8", + ) + header_template = load_py_template() + current_year = 
datetime.now().year + config = write_config(tmp_path, [current_year]) + + results = cr_checker.process_files( + [script], + {"py": header_template}, + True, + config, + use_mmap=False, + encoding="utf-8", + offset=0, + remove_offset=0, + ) + + assert results["fixed"] == 1 + assert results["no_copyright"] == 1 + expected_header = header_template.format(year=current_year) + assert script.read_text(encoding="utf-8") == ( + "#!/usr/bin/env python3\n" + expected_header + "print('hi')\n" + ) + + +# test that process_files function validates a license header without the shebang line +def test_process_files_accepts_header_without_shebang(tmp_path): + cr_checker = load_cr_checker_module() + script = tmp_path / "script.py" + header_template = load_py_template() + current_year = datetime.now().year + header = header_template.format(year=current_year) + script.write_text(header + "print('hi')\n", encoding="utf-8") + config = write_config(tmp_path, [current_year]) + + results = cr_checker.process_files( + [script], + {"py": header_template}, + False, + config, + use_mmap=False, + encoding="utf-8", + offset=0, + remove_offset=0, + ) + + assert results["no_copyright"] == 0 + + +# test that process_files function fixes a missing license header without the shebang +def test_process_files_fix_inserts_header_without_shebang(tmp_path): + cr_checker = load_cr_checker_module() + script = tmp_path / "script.py" + script.write_text("print('hi')\n", encoding="utf-8") + header_template = load_py_template() + current_year = datetime.now().year + config = write_config(tmp_path, [current_year]) + + results = cr_checker.process_files( + [script], + {"py": header_template}, + True, + config, + use_mmap=False, + encoding="utf-8", + offset=0, + remove_offset=0, + ) + + assert results["fixed"] == 1 + assert results["no_copyright"] == 1 + expected_header = header_template.format(year=current_year) + assert script.read_text(encoding="utf-8") == expected_header + "print('hi')\n" diff --git 
def detect_shebang_offset(path, encoding):
    """
    Detects if a file starts with a shebang (#!) and returns the offset needed
    to skip it.

    The offset covers the shebang line, its line terminator and every blank
    line that immediately follows it, so that a header placed after the
    shebang starts exactly at the returned position.

    The file is opened in text mode, so each line ending is seen as a single
    "\\n" regardless of how it is stored on disk. The returned length is the
    size of that normalised prefix once encoded with `encoding`.

    Args:
        path (Path): A `pathlib.Path` object pointing to the file.
        encoding (str): Encoding type to use when reading the file.

    Returns:
        int: The encoded length of the shebang line (including its newline
            and any directly following blank lines) if present, otherwise 0.
            0 is also returned when the file cannot be opened or decoded.
    """
    try:
        with open(path, "r", encoding=encoding) as handle:
            first_line = handle.readline()
            if not first_line.startswith("#!"):
                return 0

            # Text mode normalises every line ending to "\n", so a blank line
            # is exactly "\n"; stop at the first line carrying real content.
            blank_lines = 0
            for line in handle:
                if line != "\n":
                    break
                blank_lines += 1
    except (OSError, UnicodeDecodeError) as err:
        # UnicodeDecodeError is not an OSError: without catching it here an
        # undecodable file would abort the run instead of being reported as
        # "no shebang detected" like any other unreadable file.
        LOGGER.debug("Could not detect shebang in %s: %s", path, err)
        return 0

    # Encode the whole prefix once. Encoding it piecewise would add one BOM
    # per piece for codecs such as "utf-16" or "utf-8-sig" and inflate the
    # result.
    prefix = first_line + "\n" * blank_lines
    byte_length = len(prefix.encode(encoding))
    LOGGER.debug("Detected shebang in %s with offset %d bytes", path, byte_length)
    return byte_length
from file: %s", header_length, path) + LOGGER.debug("Memory mapping first %d bytes from file: %s", total_length, path) with open(path, "r", encoding=encoding) as handle: with mmap.mmap(handle.fileno(), length=length, access=mmap.ACCESS_READ) as fmap: - return fmap[:header_length].decode(encoding) + return fmap[:length].decode(encoding)[offset:] def has_copyright(path, copyright_text, use_mmap, encoding, offset, config): @@ -414,7 +448,7 @@ def fix_copyright(path, copyright_text, encoding, offset): with open(path, "w", encoding=encoding) as handle: temp.seek(0) if offset > 0: - handle.write(first_line + "\n") + handle.write(first_line) temp.seek(offset) handle.write(copyright_text.format(year=datetime.now().year)) for chunk in iter(lambda: temp.read(4096), ""): @@ -463,11 +497,18 @@ def process_files( "Skipped (no configuration for selected file extension): %s", item ) continue - if not has_copyright(item, templates[key], use_mmap, encoding, offset, config): + + # Automatically detect shebang and use its offset if no manual offset provided + shebang_offset = detect_shebang_offset(item, encoding) + effective_offset = offset + shebang_offset if offset == 0 else offset + + if not has_copyright( + item, templates[key], use_mmap, encoding, effective_offset, config + ): if fix: if remove_offset: remove_old_header(item, encoding, remove_offset) - fix_copyright(item, templates[key], encoding, offset) + fix_copyright(item, templates[key], encoding, effective_offset) results["no_copyright"] += 1 results["fixed"] += 1 else: