eclipse-score · FScholPer · Nov 13, 2025 · Nov 7, 2025 · Nov 12, 2025
diff --git a/.github/workflows/integration_tests.yml → .github/workflows/tests.yml b/.github/workflows/integration_tests.yml → .github/workflows/tests.yml
@@ -1,4 +1,4 @@
-name: Run Python Basics Integration Tests
+name: Run Python Basics Integration and Unit Tests
 on:
   pull_request:
     types: [opened, reopened, synchronize]
@@ -16,3 +16,7 @@ jobs:
         run: |
           cd starpls/integration_tests
           bazel test //...
+      - name: Run cr_checker unit tests
+        run: |
+          cd cr_checker/tests
+          bazel test //...
diff --git a/cr_checker/tests/.keep b/cr_checker/tests/.keep
diff --git a/cr_checker/tests/BUILD b/cr_checker/tests/BUILD
@@ -0,0 +1,24 @@
+# *******************************************************************************
+# Copyright (c) 2025 Contributors to the Eclipse Foundation
+#
+# See the NOTICE file(s) distributed with this work for additional
+# information regarding copyright ownership.
+#
+# This program and the accompanying materials are made available under the
+# terms of the Apache License Version 2.0 which is available at
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# SPDX-License-Identifier: Apache-2.0
+# *******************************************************************************
+
+load("@score_tooling//python_basics:defs.bzl", "score_py_pytest")
+
+# Pytest target that runs test_shebang_handling.py. It depends on the
+# cr_checker library target exported by the score_tooling module.
+score_py_pytest(
+    name = "shebang_unit_tests",
+    srcs = [
+        "test_shebang_handling.py",
+    ],
+    deps = [
+        "@score_tooling//cr_checker/tool:cr_checker_lib",
+    ],
+)
diff --git a/cr_checker/tests/MODULE.bazel b/cr_checker/tests/MODULE.bazel
@@ -0,0 +1,49 @@
+# *******************************************************************************
+# Copyright (c) 2025 Contributors to the Eclipse Foundation
+#
+# See the NOTICE file(s) distributed with this work for additional
+# information regarding copyright ownership.
+#
+# This program and the accompanying materials are made available under the
+# terms of the Apache License Version 2.0 which is available at
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# SPDX-License-Identifier: Apache-2.0
+# *******************************************************************************
+# Bazel module declaration for the cr_checker test workspace in this directory.
+module(
+    name = "score_cr_checker_tests",
+    version = "0.1.0",
+    compatibility_level = 0,
+)
+
+bazel_dep(name = "rules_shell", version = "0.5.0")
+
+# begin Tests
+
+# PYTHON
+bazel_dep(name = "rules_python", version = "1.4.1")
+
+# Shared by the toolchain registration and the pip hub below so both always
+# use the same interpreter version.
+PYTHON_VERSION = "3.12"
+
+python = use_extension("@rules_python//python/extensions:python.bzl", "python")
+python.toolchain(
+    python_version = PYTHON_VERSION,
+)
+use_repo(python)
+
+# PIP
+# Test-only pip hub; packages are pinned in this module's requirements_lock.txt.
+pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip")
+pip.parse(
+    hub_name = "pip_deps_test",
+    python_version = PYTHON_VERSION,
+    requirements_lock = "//:requirements_lock.txt",
+)
+use_repo(pip, "pip_deps_test")
+
+# score_tooling is taken from the local checkout two directories above this
+# module (see the path override) rather than resolved as version 0.0.0.
+bazel_dep(name = "score_tooling", version = "0.0.0")
+local_path_override(
+    module_name = "score_tooling",
+    path = "../../",
+)
+
+# end Tests
diff --git a/cr_checker/tests/requirements_lock.txt b/cr_checker/tests/requirements_lock.txt
@@ -0,0 +1 @@
+bazel-runfiles==1.3.0
diff --git a/cr_checker/tests/test_shebang_handling.py b/cr_checker/tests/test_shebang_handling.py
@@ -0,0 +1,169 @@
+# *******************************************************************************
+# Copyright (c) 2025 Contributors to the Eclipse Foundation
+#
+# See the NOTICE file(s) distributed with this work for additional
+# information regarding copyright ownership.
+#
+# This program and the accompanying materials are made available under the
+# terms of the Apache License Version 2.0 which is available at
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# SPDX-License-Identifier: Apache-2.0
+# *******************************************************************************
+# unit tests for the shebang handling in the cr_checker module
+from __future__ import annotations
+
+import importlib.util
+import json
+from datetime import datetime
+from pathlib import Path
+
+
+# load the cr_checker module
+def load_cr_checker_module():
+    """Import ``tool/cr_checker.py`` by file path and return the module object.
+
+    The source file is looked up relative to this test file: one directory
+    above the directory containing it, then ``tool/cr_checker.py``. It is
+    loaded under the module name ``cr_checker_module`` and executed.
+
+    Raises:
+        RuntimeError: If no import spec or loader could be created.
+    """
+    checker_source = Path(__file__).resolve().parents[1] / "tool" / "cr_checker.py"
+    spec = importlib.util.spec_from_file_location("cr_checker_module", checker_source)
+    if spec is not None and spec.loader is not None:
+        loaded = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(loaded)
+        return loaded
+
+    raise RuntimeError(f"Failed to load cr_checker module from {checker_source}")
+
+
+# load the license template
+def load_py_template() -> str:
+    """Return the ``py`` entry of ``resources/templates.ini``.
+
+    The templates file is located relative to this test file and parsed with
+    ``load_templates`` from a freshly loaded cr_checker module.
+    """
+    checker = load_cr_checker_module()
+    resources_dir = Path(__file__).resolve().parents[1] / "resources"
+    return checker.load_templates(resources_dir / "templates.ini")["py"]
+
+
+# Write the config file at test time so that its year always matches the year
+# written into the mock "script.py" file.
+def write_config(path: Path, years: list[int]) -> Path:
+    """Create ``config.json`` inside *path* listing the given years.
+
+    The file holds a JSON object of the form ``{"years": [...]}`` encoded as
+    UTF-8.
+
+    Returns:
+        The path of the written ``config.json`` file.
+    """
+    target = path / "config.json"
+    payload = json.dumps({"years": years})
+    target.write_text(payload, encoding="utf-8")
+    return target
+
+
+# test that offset matches the length of the shebang line including trailing newlines
+def test_detect_shebang_offset_counts_trailing_newlines(tmp_path):
+    """The offset spans the shebang line and the blank line that follows it."""
+    shebang_block = "#!/usr/bin/env python3\n\n"
+    target = tmp_path / "script.py"
+    target.write_text(shebang_block + "print('hi')\n", encoding="utf-8")
+    checker = load_cr_checker_module()
+
+    detected = checker.detect_shebang_offset(target, "utf-8")
+
+    assert detected == len(shebang_block.encode("utf-8"))
+
+
+# test that process_files function validates a license header after the shebang line
+def test_process_files_accepts_header_after_shebang(tmp_path):
+    """A license header placed directly after a shebang is not reported."""
+    checker = load_cr_checker_module()
+    template = load_py_template()
+    year = datetime.now().year
+    target = tmp_path / "script.py"
+    body = "#!/usr/bin/env python3\n" + template.format(year=year) + "print('hi')\n"
+    target.write_text(body, encoding="utf-8")
+    config_file = write_config(tmp_path, [year])
+    run_options = {
+        "use_mmap": False,
+        "encoding": "utf-8",
+        "offset": 0,
+        "remove_offset": 0,
+    }
+
+    summary = checker.process_files(
+        [target], {"py": template}, False, config_file, **run_options
+    )
+
+    assert summary["no_copyright"] == 0
+
+
+# test that process_files function fixes a missing license header after the shebang line
+def test_process_files_fix_inserts_header_after_shebang(tmp_path):
+    """Fix mode writes the header between the shebang line and the code."""
+    shebang = "#!/usr/bin/env python3\n"
+    code = "print('hi')\n"
+    checker = load_cr_checker_module()
+    target = tmp_path / "script.py"
+    target.write_text(shebang + code, encoding="utf-8")
+    template = load_py_template()
+    year = datetime.now().year
+    config_file = write_config(tmp_path, [year])
+    run_options = {
+        "use_mmap": False,
+        "encoding": "utf-8",
+        "offset": 0,
+        "remove_offset": 0,
+    }
+
+    summary = checker.process_files(
+        [target], {"py": template}, True, config_file, **run_options
+    )
+
+    assert summary["fixed"] == 1
+    assert summary["no_copyright"] == 1
+    rewritten = target.read_text(encoding="utf-8")
+    assert rewritten == shebang + template.format(year=year) + code
+
+
+# test that process_files function validates a license header without the shebang line
+def test_process_files_accepts_header_without_shebang(tmp_path):
+    """A license header at the very top of a shebang-less file is not reported."""
+    checker = load_cr_checker_module()
+    template = load_py_template()
+    year = datetime.now().year
+    target = tmp_path / "script.py"
+    target.write_text(
+        template.format(year=year) + "print('hi')\n",
+        encoding="utf-8",
+    )
+    config_file = write_config(tmp_path, [year])
+    run_options = {
+        "use_mmap": False,
+        "encoding": "utf-8",
+        "offset": 0,
+        "remove_offset": 0,
+    }
+
+    summary = checker.process_files(
+        [target], {"py": template}, False, config_file, **run_options
+    )
+
+    assert summary["no_copyright"] == 0
+
+
+# test that process_files function fixes a missing license header without the shebang
+def test_process_files_fix_inserts_header_without_shebang(tmp_path):
+    """Fix mode prepends the header to a file that has no shebang line."""
+    code = "print('hi')\n"
+    checker = load_cr_checker_module()
+    target = tmp_path / "script.py"
+    target.write_text(code, encoding="utf-8")
+    template = load_py_template()
+    year = datetime.now().year
+    config_file = write_config(tmp_path, [year])
+    run_options = {
+        "use_mmap": False,
+        "encoding": "utf-8",
+        "offset": 0,
+        "remove_offset": 0,
+    }
+
+    summary = checker.process_files(
+        [target], {"py": template}, True, config_file, **run_options
+    )
+
+    assert summary["fixed"] == 1
+    assert summary["no_copyright"] == 1
+    rewritten = target.read_text(encoding="utf-8")
+    assert rewritten == template.format(year=year) + code
diff --git a/cr_checker/tool/cr_checker.py b/cr_checker/tool/cr_checker.py
@@ -188,6 +188,39 @@ def configure_logging(log_file_path=None, verbose=False):
     LOGGER.addHandler(handler)
 
 
+def detect_shebang_offset(path, encoding):
+    """
+    Measures how much of the start of a file is occupied by a shebang (#!).
+
+    The measured span covers the shebang line itself plus every newline or
+    carriage-return character that directly follows it, i.e. any blank lines
+    between the shebang and the rest of the file.
+
+    Args:
+        path (Path): A `pathlib.Path` object pointing to the file.
+        encoding (str): Encoding used to read the file and to size the
+                        skipped text.
+
+    Returns:
+        int: Length of the skipped text once encoded with `encoding`, or 0
+             when the file does not start with a shebang or an OSError
+             occurs while opening/reading it.
+    """
+    # NOTE(review): the result is a byte count, while the callers visible in
+    # this file slice decoded text with it and seek a text handle to it --
+    # confirm these agree for non-ASCII shebang lines and CRLF files.
+    try:
+        with open(path, "r", encoding=encoding) as handle:
+            shebang = handle.readline()
+            if not shebang.startswith("#!"):
+                return 0
+
+            skipped = len(shebang.encode(encoding))
+            # Extend the span over line-break characters that follow the
+            # shebang; stop at end of file or the first other character.
+            for char in iter(lambda: handle.read(1), ""):
+                if char not in ("\n", "\r"):
+                    break
+                skipped += len(char.encode(encoding))
+            LOGGER.debug(
+                "Detected shebang in %s with offset %d bytes", path, skipped
+            )
+            return skipped
+    except OSError as err:
+        LOGGER.debug("Could not detect shebang in %s: %s", path, err)
+    return 0
+
+
 def load_text_from_file(path, header_length, encoding, offset):
     """
     Reads the first portion of a file, up to `header_length` characters
@@ -210,7 +243,8 @@ def load_text_from_file(path, header_length, encoding, offset):
         "Reading first %d characters from file: %s [%s]", total_length, path, encoding
     )
     with open(path, "r", encoding=encoding) as handle:
-        return handle.read(total_length)
+        content = handle.read(total_length)
+        return content[offset:] if offset else content
 
 
 def load_text_from_file_with_mmap(path, header_length, encoding, offset):
@@ -240,10 +274,10 @@ def load_text_from_file_with_mmap(path, header_length, encoding, offset):
         )
         return ""
 
-    LOGGER.debug("Memory mapping first %d bytes from file: %s", header_length, path)
+    LOGGER.debug("Memory mapping first %d bytes from file: %s", total_length, path)
     with open(path, "r", encoding=encoding) as handle:
         with mmap.mmap(handle.fileno(), length=length, access=mmap.ACCESS_READ) as fmap:
-            return fmap[:header_length].decode(encoding)
+            return fmap[:length].decode(encoding)[offset:]
 
 
 def has_copyright(path, copyright_text, use_mmap, encoding, offset, config):
@@ -414,7 +448,7 @@ def fix_copyright(path, copyright_text, encoding, offset):
         with open(path, "w", encoding=encoding) as handle:
             temp.seek(0)
             if offset > 0:
-                handle.write(first_line + "\n")
+                handle.write(first_line)
                 temp.seek(offset)
             handle.write(copyright_text.format(year=datetime.now().year))
             for chunk in iter(lambda: temp.read(4096), ""):
@@ -463,11 +497,18 @@ def process_files(
                 "Skipped (no configuration for selected file extension): %s", item
             )
             continue
-        if not has_copyright(item, templates[key], use_mmap, encoding, offset, config):
+
+        # Automatically detect shebang and use its offset if no manual offset provided
+        shebang_offset = detect_shebang_offset(item, encoding)
+        effective_offset = offset + shebang_offset if offset == 0 else offset
+
+        if not has_copyright(
+            item, templates[key], use_mmap, encoding, effective_offset, config
+        ):
             if fix:
                 if remove_offset:
                     remove_old_header(item, encoding, remove_offset)
-                fix_copyright(item, templates[key], encoding, offset)
+                fix_copyright(item, templates[key], encoding, effective_offset)
                 results["no_copyright"] += 1
                 results["fixed"] += 1
             else: