From 299f65055efc4cd56624b15d2276d48e03edd48c Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Fri, 9 Jan 2026 09:42:12 +0900 Subject: [PATCH 1/3] fix(pipstar): correctly handle platlib and purelib in .data Some packages like libclang use `platlib` in the `.data` directory to hold their main files. This PR is implementing correct handling of such packages by recursively merging two trees. If we have any collisions, we will print an error and stop. That is unlikely but better to be safe. Users can patch the failure to be a warning if necessary. In order to make this more testable, move the functions to a separate file. Fixes #3500 --- examples/pip_parse/BUILD.bazel | 5 +- examples/pip_parse/requirements.in | 1 + examples/pip_parse/requirements_lock.txt | 12 +++ python/private/pypi/BUILD.bazel | 11 +++ python/private/pypi/whl_extract.bzl | 106 +++++++++++++++++++++++ python/private/pypi/whl_library.bzl | 47 +--------- 6 files changed, 137 insertions(+), 45 deletions(-) create mode 100644 python/private/pypi/whl_extract.bzl diff --git a/examples/pip_parse/BUILD.bazel b/examples/pip_parse/BUILD.bazel index 6ed8d26286..37a25fe873 100644 --- a/examples/pip_parse/BUILD.bazel +++ b/examples/pip_parse/BUILD.bazel @@ -79,5 +79,8 @@ py_test( "WHEEL_DIST_INFO_CONTENTS": "$(rootpaths @pypi//requests:dist_info)", "YAMLLINT_ENTRY_POINT": "$(rlocationpath :yamllint)", }, - deps = ["@rules_python//python/runfiles"], + deps = [ + "@pypi//libclang", + "@rules_python//python/runfiles", + ], ) diff --git a/examples/pip_parse/requirements.in b/examples/pip_parse/requirements.in index 9d9e766d21..e4af3b1efe 100644 --- a/examples/pip_parse/requirements.in +++ b/examples/pip_parse/requirements.in @@ -3,3 +3,4 @@ s3cmd~=2.1.0 yamllint~=1.28.0 sphinx sphinxcontrib-serializinghtml +libclang diff --git a/examples/pip_parse/requirements_lock.txt b/examples/pip_parse/requirements_lock.txt index dc34b45a45..13a2bba1e6 100644 --- a/examples/pip_parse/requirements_lock.txt +++ 
b/examples/pip_parse/requirements_lock.txt @@ -42,6 +42,18 @@ jinja2==3.1.6 \ --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \ --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 # via sphinx +libclang==18.1.1 \ + --hash=sha256:0b2e143f0fac830156feb56f9231ff8338c20aecfe72b4ffe96f19e5a1dbb69a \ + --hash=sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8 \ + --hash=sha256:4dd2d3b82fab35e2bf9ca717d7b63ac990a3519c7e312f19fa8e86dcc712f7fb \ + --hash=sha256:54dda940a4a0491a9d1532bf071ea3ef26e6dbaf03b5000ed94dd7174e8f9592 \ + --hash=sha256:69f8eb8f65c279e765ffd28aaa7e9e364c776c17618af8bff22a8df58677ff4f \ + --hash=sha256:6f14c3f194704e5d09769108f03185fce7acaf1d1ae4bbb2f30a72c2400cb7c5 \ + --hash=sha256:83ce5045d101b669ac38e6da8e58765f12da2d3aafb3b9b98d88b286a60964d8 \ + --hash=sha256:a1214966d08d73d971287fc3ead8dfaf82eb07fb197680d8b3859dbbbbf78250 \ + --hash=sha256:c533091d8a3bbf7460a00cb6c1a71da93bffe148f172c7d03b1c31fbf8aa2a0b \ + --hash=sha256:cf4a99b05376513717ab5d82a0db832c56ccea4fd61a69dbb7bccf2dfb207dbe + # via -r requirements.in markupsafe==2.1.3 \ --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ --hash=sha256:0a4e4a1aff6c7ac4cd55792abf96c915634c2b97e3cc1c7129578aa68ebd754e \ diff --git a/python/private/pypi/BUILD.bazel b/python/private/pypi/BUILD.bazel index aa96cf86a5..8194bb520d 100644 --- a/python/private/pypi/BUILD.bazel +++ b/python/private/pypi/BUILD.bazel @@ -415,6 +415,16 @@ bzl_library( srcs = ["whl_config_setting.bzl"], ) +bzl_library( + name = "whl_extract_bzl", + srcs = ["whl_extract.bzl"], + deps = [ + ":whl_metadata_bzl", + "//python/private:repo_utils_bzl", + "@rules_python_internal//:rules_python_config_bzl", + ], +) + bzl_library( name = "whl_library_alias_bzl", srcs = ["whl_library_alias.bzl"], @@ -435,6 +445,7 @@ bzl_library( ":patch_whl_bzl", ":pep508_requirement_bzl", ":pypi_repo_utils_bzl", + ":whl_extract_bzl", 
":whl_metadata_bzl", ":whl_target_platforms_bzl", "//python/private:auth_bzl", diff --git a/python/private/pypi/whl_extract.bzl b/python/private/pypi/whl_extract.bzl new file mode 100644 index 0000000000..cb6b75f076 --- /dev/null +++ b/python/private/pypi/whl_extract.bzl @@ -0,0 +1,106 @@ +"""A simple whl extractor.""" + +load("@rules_python_internal//:rules_python_config.bzl", rp_config = "config") +load("//python/private:repo_utils.bzl", "repo_utils") +load(":whl_metadata.bzl", "find_whl_metadata") + +def whl_extract(rctx, *, whl_path, logger): + """Extract whls in Starlark. + + Args: + rctx: the repository ctx. + whl_path: the whl path to extract. + logger: The logger to use + """ + install_dir_path = whl_path.dirname.get_child("site-packages") + repo_utils.extract( + rctx, + archive = whl_path, + output = install_dir_path, + supports_whl_extraction = rp_config.supports_whl_extraction, + ) + metadata_file = find_whl_metadata( + install_dir = install_dir_path, + logger = logger, + ) + + # Get the .dist_info dir name + dist_info_dir = metadata_file.dirname + rctx.file( + dist_info_dir.get_child("INSTALLER"), + "https://github.com/bazel-contrib/rules_python#pipstar", + ) + repo_root_dir = whl_path.dirname + + # Get the .dist_info dir name + data_dir = dist_info_dir.dirname.get_child(dist_info_dir.basename[:-len(".dist-info")] + ".data") + if data_dir.exists: + for prefix, dest_prefix in { + # https://docs.python.org/3/library/sysconfig.html#posix-prefix + # We are taking this from the legacy whl installer config + "data": "data", + "headers": "include", + # In theory there may be directory collisions here, so it would be best to + # merge the paths here. We are doing for quite a few levels deep. What is + # more, this code has to be reasonably efficient because some packages like + # to not put everything to the top level, but to indicate explicitly if + # something is in `platlib` or `purelib` (e.g. libclang wheel). 
+ "platlib": "site-packages", + "purelib": "site-packages", + "scripts": "bin", + }.items(): + src = data_dir.get_child(prefix) + if not src.exists: + # The prefix does not exist in the wheel, we can continue + continue + + for (src, dest) in merge_trees(src, repo_root_dir.get_child(dest_prefix)): + logger.debug(lambda: "Renaming: {} -> {}".format(src, dest)) + rctx.rename(src, dest) + + # TODO @aignas 2025-12-16: when moving scripts to `bin`, rewrite the #!python + # shebang to be something else, for inspiration look at the hermetic + # toolchain wrappers + + # Ensure that there is no data dir left + rctx.delete(data_dir) + +def merge_trees(src, dest): + """Merge src into the destination path. + + This will attempt to merge-move src files to the destination directory if there are + existing files. Fails if the directory depth exceeds 10000 or if there are collisions. + + Args: + src: {type}`path` a src path to rename. + dest: {type}`path` a dest path to rename to. + + Returns: + A list of tuples for src and destination paths. 
+ """ + ret = [] + remaining = [(src, dest)] + collisions = [] + for _ in range(10000): + if collisions or not remaining: + break + + tmp = [] + for (src, dest) in remaining: + if not dest.exists: + ret.append((src, dest)) + continue + + if not src.is_dir: + collisions.append(src) + continue + + for f in src.readdir(): + tmp.append((f, dest.get_child(f.basename))) + + remaining = tmp + + if collisions: + fail(lambda: "detected collisions between platlib and purelib data: {}".format(collisions)) + + return ret diff --git a/python/private/pypi/whl_library.bzl b/python/private/pypi/whl_library.bzl index c368dea733..3c4b6beeaf 100644 --- a/python/private/pypi/whl_library.bzl +++ b/python/private/pypi/whl_library.bzl @@ -26,7 +26,8 @@ load(":parse_whl_name.bzl", "parse_whl_name") load(":patch_whl.bzl", "patch_whl") load(":pep508_requirement.bzl", "requirement") load(":pypi_repo_utils.bzl", "pypi_repo_utils") -load(":whl_metadata.bzl", "find_whl_metadata", "whl_metadata") +load(":whl_extract.bzl", "whl_extract") +load(":whl_metadata.bzl", "whl_metadata") load(":whl_target_platforms.bzl", "whl_target_platforms") _CPPFLAGS = "CPPFLAGS" @@ -265,48 +266,6 @@ def _create_repository_execution_environment(rctx, python_interpreter, logger = env[_CPPFLAGS] = " ".join(cppflags) return env -def _extract_whl_star(rctx, *, whl_path, logger): - install_dir_path = whl_path.dirname.get_child("site-packages") - repo_utils.extract( - rctx, - archive = whl_path, - output = install_dir_path, - supports_whl_extraction = rp_config.supports_whl_extraction, - ) - metadata_file = find_whl_metadata( - install_dir = install_dir_path, - logger = logger, - ) - - # Get the .dist_info dir name - dist_info_dir = metadata_file.dirname - rctx.file( - dist_info_dir.get_child("INSTALLER"), - "https://github.com/bazel-contrib/rules_python#pipstar", - ) - repo_root_dir = whl_path.dirname - - # Get the .dist_info dir name - data_dir = dist_info_dir.dirname.get_child(dist_info_dir.basename[:-len(".dist-info")] 
+ ".data") - if data_dir.exists: - for prefix, dest in { - # https://docs.python.org/3/library/sysconfig.html#posix-prefix - # We are taking this from the legacy whl installer config - "data": "data", - "headers": "include", - "platlib": "site-packages", - "purelib": "site-packages", - "scripts": "bin", - }.items(): - src = data_dir.get_child(prefix) - dest = repo_root_dir.get_child(dest) - if src.exists: - rctx.rename(src, dest) - - # TODO @aignas 2025-12-16: when moving scripts to `bin`, rewrite the #!python - # shebang to be something else, for inspiration look at the hermetic - # toolchain wrappers - def _extract_whl_py(rctx, *, python_interpreter, args, whl_path, environment, logger): target_platforms = rctx.attr.experimental_target_platforms or [] if target_platforms: @@ -448,7 +407,7 @@ def _whl_library_impl(rctx): ) if enable_pipstar_extract: - _extract_whl_star(rctx, whl_path = whl_path, logger = logger) + whl_extract(rctx, whl_path = whl_path, logger = logger) else: _extract_whl_py( rctx, From 5c8d39f6c561cca35d15fa7ed3063ad3eb9676ca Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Sat, 10 Jan 2026 00:44:07 +0900 Subject: [PATCH 2/3] add libclang to windows reqs --- examples/pip_parse/requirements_windows.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/examples/pip_parse/requirements_windows.txt b/examples/pip_parse/requirements_windows.txt index 78c1a45690..7a1329d521 100644 --- a/examples/pip_parse/requirements_windows.txt +++ b/examples/pip_parse/requirements_windows.txt @@ -46,6 +46,18 @@ jinja2==3.1.6 \ --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \ --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 # via sphinx +libclang==18.1.1 \ + --hash=sha256:0b2e143f0fac830156feb56f9231ff8338c20aecfe72b4ffe96f19e5a1dbb69a \ + --hash=sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8 \ + 
--hash=sha256:4dd2d3b82fab35e2bf9ca717d7b63ac990a3519c7e312f19fa8e86dcc712f7fb \ + --hash=sha256:54dda940a4a0491a9d1532bf071ea3ef26e6dbaf03b5000ed94dd7174e8f9592 \ + --hash=sha256:69f8eb8f65c279e765ffd28aaa7e9e364c776c17618af8bff22a8df58677ff4f \ + --hash=sha256:6f14c3f194704e5d09769108f03185fce7acaf1d1ae4bbb2f30a72c2400cb7c5 \ + --hash=sha256:83ce5045d101b669ac38e6da8e58765f12da2d3aafb3b9b98d88b286a60964d8 \ + --hash=sha256:a1214966d08d73d971287fc3ead8dfaf82eb07fb197680d8b3859dbbbbf78250 \ + --hash=sha256:c533091d8a3bbf7460a00cb6c1a71da93bffe148f172c7d03b1c31fbf8aa2a0b \ + --hash=sha256:cf4a99b05376513717ab5d82a0db832c56ccea4fd61a69dbb7bccf2dfb207dbe + # via -r requirements.in markupsafe==2.1.3 \ --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ --hash=sha256:0a4e4a1aff6c7ac4cd55792abf96c915634c2b97e3cc1c7129578aa68ebd754e \ From 00a8c2810100979d3ec0f2cdab20efe70f8b3834 Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Sat, 10 Jan 2026 00:44:15 +0900 Subject: [PATCH 3/3] better crash handling --- python/private/pypi/whl_extract.bzl | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/python/private/pypi/whl_extract.bzl b/python/private/pypi/whl_extract.bzl index cb6b75f076..6b2e0507ac 100644 --- a/python/private/pypi/whl_extract.bzl +++ b/python/private/pypi/whl_extract.bzl @@ -86,21 +86,24 @@ def merge_trees(src, dest): break tmp = [] - for (src, dest) in remaining: - if not dest.exists: - ret.append((src, dest)) + for (s, d) in remaining: + if not d.exists: + ret.append((s, d)) continue - if not src.is_dir: - collisions.append(src) + if not s.is_dir or not d.is_dir: + collisions.append(s) continue - for f in src.readdir(): - tmp.append((f, dest.get_child(f.basename))) + for file_or_dir in s.readdir(): + tmp.append((file_or_dir, d.get_child(file_or_dir.basename))) remaining = tmp + if remaining: + fail("Exceeded maximum directory depth of 10000 during 
tree merge.") + if collisions: - fail(lambda: "detected collisions between platlib and purelib data: {}".format(collisions)) + fail("Detected collisions between {} and {}: {}".format(src, dest, collisions)) return ret