11"""Tool for generating Software Bill of Materials (SBOM) for Python's dependencies"""
2-
2+ import os
33import re
44import hashlib
55import json
66import glob
77import pathlib
88import subprocess
99import typing
10+ from urllib .request import urlopen
11+
12+ CPYTHON_ROOT_DIR = pathlib .Path (__file__ ).parent .parent .parent
1013
1114# Before adding a new entry to this list, double check that
1215# the license expression is a valid SPDX license expression:
@@ -43,15 +46,14 @@ class PackageFiles(typing.NamedTuple):
4346# values to 'exclude' if we create new files within tracked
4447# directories that aren't sourced from third-party packages.
4548PACKAGE_TO_FILES = {
49+ # NOTE: pip's entry in this structure is automatically generated in
50+ # the 'discover_pip_sbom_package()' function below.
4651 "mpdecimal" : PackageFiles (
4752 include = ["Modules/_decimal/libmpdec/**" ]
4853 ),
4954 "expat" : PackageFiles (
5055 include = ["Modules/expat/**" ]
5156 ),
52- "pip" : PackageFiles (
53- include = ["Lib/ensurepip/_bundled/pip-23.3.2-py3-none-any.whl" ]
54- ),
5557 "macholib" : PackageFiles (
5658 include = ["Lib/ctypes/macholib/**" ],
5759 exclude = [
@@ -106,13 +108,106 @@ def filter_gitignored_paths(paths: list[str]) -> list[str]:
106108 return sorted ([line .split ()[- 1 ] for line in git_check_ignore_lines if line .startswith ("::" )])
107109
108110
def discover_pip_sbom_package(sbom_data: dict[str, typing.Any]) -> None:
    """Regenerate pip's SBOM package entry from the wheel bundled with ensurepip.

    pip is part of the Python packaging ecosystem, so its version, checksum
    and download location can be discovered automatically instead of being
    maintained by hand.

    The function:

    * locates the single ``pip-*`` file in ``Lib/ensurepip/_bundled``,
    * registers that file under ``PACKAGE_TO_FILES["pip"]``,
    * reads the version from the wheel filename and hashes the wheel (SHA-256),
    * asks PyPI's JSON API for the release and checks that the artifact with
      the same filename has the same SHA-256 digest,
    * replaces any existing ``"pip"`` entry in ``sbom_data["packages"]`` with
      a freshly generated one.

    Args:
        sbom_data: The parsed SBOM document. Its ``"packages"`` list is
            replaced in place on the passed dictionary.

    Raises:
        SystemExit: With exit code 1 (after printing a message) when there is
            not exactly one pip wheel in the bundled directory, when PyPI
            can't be reached, when the response is not the expected JSON
            shape, or when the local wheel doesn't match the PyPI artifact.
    """
    ensurepip_bundled_dir = CPYTHON_ROOT_DIR / "Lib/ensurepip/_bundled"

    # Find the hopefully one pip wheel in the bundled directory.
    pip_wheels = [
        wheel_filename
        for wheel_filename in os.listdir(ensurepip_bundled_dir)
        if wheel_filename.startswith("pip-")
    ]
    if len(pip_wheels) != 1:
        print("Zero or multiple pip wheels detected in 'Lib/ensurepip/_bundled'")
        # 'exit()' is injected by the 'site' module and is missing under
        # 'python -S'; raising SystemExit directly is always available.
        raise SystemExit(1)
    pip_wheel_filename = pip_wheels[0]

    # Add the wheel filename to the list of files so the SBOM file
    # and relationship generator can work its magic on the wheel too.
    # (Item assignment mutates the module-level dict; no 'global' needed.)
    PACKAGE_TO_FILES["pip"] = PackageFiles(
        include=[f"Lib/ensurepip/_bundled/{pip_wheel_filename}"]
    )

    # Wheel filename format puts the version right after the project name.
    pip_version = pip_wheel_filename.split("-")[1]
    pip_checksum_sha256 = hashlib.sha256(
        (ensurepip_bundled_dir / pip_wheel_filename).read_bytes()
    ).hexdigest()

    # Get pip's download location from PyPI. Check that the checksum is correct too.
    try:
        # Use the response as a context manager so the connection is
        # closed even if reading or decoding fails.
        with urlopen(f"https://pypi.org/pypi/pip/{pip_version}/json") as response:
            pip_release_metadata = json.loads(response.read())

        # Look for a matching artifact filename and then check
        # its remote checksum to the local one.
        artifact: dict[str, typing.Any]
        for artifact in pip_release_metadata["urls"]:
            if artifact["filename"] == pip_wheel_filename:
                break
        else:
            raise ValueError(f"No matching filename on PyPI for '{pip_wheel_filename}'")
        if artifact["digests"]["sha256"] != pip_checksum_sha256:
            raise ValueError("Local pip checksum doesn't match artifact on PyPI")

        # Successfully found the download URL for the matching artifact.
        pip_download_url = artifact["url"]

    # OSError covers network failures, ValueError covers invalid JSON and the
    # mismatches raised above, KeyError covers an unexpected response shape.
    except (OSError, ValueError, KeyError) as e:
        print(f"Couldn't fetch pip's metadata from PyPI: {e}")
        raise SystemExit(1)

    # Remove pip from the existing SBOM packages if it's there
    # and then overwrite its entry with our own generated one.
    sbom_data["packages"] = [
        sbom_package
        for sbom_package in sbom_data["packages"]
        if sbom_package["name"] != "pip"
    ]
    sbom_data["packages"].append(
        {
            "SPDXID": spdx_id("SPDXRef-PACKAGE-pip"),
            "name": "pip",
            "versionInfo": pip_version,
            "originator": "Organization: Python Packaging Authority",
            "licenseConcluded": "MIT",
            "downloadLocation": pip_download_url,
            "checksums": [
                {"algorithm": "sha256", "checksumValue": pip_checksum_sha256}
            ],
            "externalRefs": [
                {
                    "referenceCategory": "SECURITY",
                    "referenceLocator": f"cpe:2.3:a:pypa:pip:{pip_version}:*:*:*:*:*:*:*",
                    "referenceType": "cpe23Type",
                },
                {
                    "referenceCategory": "PACKAGE_MANAGER",
                    "referenceLocator": f"pkg:pypi/pip@{pip_version}",
                    "referenceType": "purl",
                },
            ],
            "primaryPackagePurpose": "SOURCE",
        }
    )
198+
199+
109200def main () -> None :
110- root_dir = pathlib .Path (__file__ ).parent .parent .parent
111- sbom_path = root_dir / "Misc/sbom.spdx.json"
201+ sbom_path = CPYTHON_ROOT_DIR / "Misc/sbom.spdx.json"
112202 sbom_data = json .loads (sbom_path .read_bytes ())
113203
114- # Make a bunch of assertions about the SBOM data to ensure it's consistent.
204+ # Insert pip's SBOM metadata from the wheel.
205+ discover_pip_sbom_package (sbom_data )
206+
207+ # Ensure all packages in this tool are represented also in the SBOM file.
115208 assert {package ["name" ] for package in sbom_data ["packages" ]} == set (PACKAGE_TO_FILES )
209+
210+ # Make a bunch of assertions about the SBOM data to ensure it's consistent.
116211 for package in sbom_data ["packages" ]:
117212
118213 # Properties and ID must be properly formed.
@@ -138,17 +233,17 @@ def main() -> None:
138233 for include in sorted (files .include ):
139234
140235 # Find all the paths and then filter them through .gitignore.
141- paths = glob .glob (include , root_dir = root_dir , recursive = True )
236+ paths = glob .glob (include , root_dir = CPYTHON_ROOT_DIR , recursive = True )
142237 paths = filter_gitignored_paths (paths )
143238 assert paths , include # Make sure that every value returns something!
144239
145240 for path in paths :
146241 # Skip directories and excluded files
147- if not (root_dir / path ).is_file () or path in exclude :
242+ if not (CPYTHON_ROOT_DIR / path ).is_file () or path in exclude :
148243 continue
149244
150245 # SPDX requires SHA1 to be used for files, but we provide SHA256 too.
151- data = (root_dir / path ).read_bytes ()
246+ data = (CPYTHON_ROOT_DIR / path ).read_bytes ()
152247 checksum_sha1 = hashlib .sha1 (data ).hexdigest ()
153248 checksum_sha256 = hashlib .sha256 (data ).hexdigest ()
154249
0 commit comments