From 27bd2281da4d19089ef1eba6119aa4c846567d32 Mon Sep 17 00:00:00 2001 From: Minhaj Uddin Date: Tue, 18 Mar 2025 18:20:56 +0000 Subject: [PATCH 1/2] Fix for New Firefox releases that are provided in tar.xz instead of tar.bz2 #44 --- bci/browser/binary/vendors/firefox.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/bci/browser/binary/vendors/firefox.py b/bci/browser/binary/vendors/firefox.py index 39c80671..2a667b74 100644 --- a/bci/browser/binary/vendors/firefox.py +++ b/bci/browser/binary/vendors/firefox.py @@ -48,8 +48,22 @@ def download_binary(self): with requests.get(binary_url, stream=True) as req: with open(tar_file_path, 'wb') as file: shutil.copyfileobj(req.raw, file) - with tarfile.open(tar_file_path, "r:bz2") as tar_ref: + + # Determine correct archive format based on version + if int(self.state.version) >= 135: + tar_file_path = f'/tmp/{self.state.name}/archive.tar.xz' + tar_mode = "r:xz" + else: + tar_file_path = f'/tmp/{self.state.name}/archive.tar.bz2' + tar_mode = "r:bz2" + # Download the correct archive + with requests.get(binary_url, stream=True) as req: + with open(tar_file_path, 'wb') as file: + shutil.copyfileobj(req.raw, file) + # Extract the archive using the determined format + with tarfile.open(tar_file_path, tar_mode) as tar_ref: tar_ref.extractall(os.path.dirname(tar_file_path)) + bin_path = self.get_potential_bin_path() os.makedirs(os.path.dirname(bin_path), exist_ok=True) unzipped_folder_path = os.path.join(os.path.dirname(tar_file_path), "firefox") From e14d90de459f265f37557048acb4ab3d24d3d255 Mon Sep 17 00:00:00 2001 From: Gertjan Date: Thu, 27 Mar 2025 16:27:21 +0000 Subject: [PATCH 2/2] Improve download and extract process --- bci/browser/binary/binary.py | 15 +++- bci/browser/binary/vendors/chromium.py | 44 ++--------- bci/browser/binary/vendors/firefox.py | 54 ++----------- bci/database/mongo/binary_cache.py | 4 + bci/util.py | 79 +++++++++++++++++-- .../states/revisions/chromium.py | 10 +-- .../states/revisions/firefox.py | 4 +- bci/version_control/states/state.py | 5 +- .../states/versions/chromium.py | 6 +- .../states/versions/firefox.py | 10 ++- 10 files changed, 125 insertions(+), 106 deletions(-) diff --git a/bci/browser/binary/binary.py b/bci/browser/binary/binary.py index 3a586fb8..5e43591d 100644 --- a/bci/browser/binary/binary.py +++ b/bci/browser/binary/binary.py @@ -102,8 +102,21 @@ def is_available_locally(self): def is_available_online(self): return self.state.has_online_binary() - @abstractmethod def download_binary(self): + if self.is_available_locally(): + logger.debug(f'Binary for {self.state} was already downloaded ({self.get_bin_path()})') + else: + binary_urls = self.state.get_online_binary_urls() + binary_dst_folder = os.path.dirname(self.get_potential_bin_path()) + util.download_and_extract(binary_urls, binary_dst_folder) + self.configure_binary() + + @abstractmethod + def configure_binary(self): + """ + Configures the browser binary. + This method is idempotent. + """ pass def is_built(self): diff --git a/bci/browser/binary/vendors/chromium.py b/bci/browser/binary/vendors/chromium.py index f7a99a08..4a6a5689 100644 --- a/bci/browser/binary/vendors/chromium.py +++ b/bci/browser/binary/vendors/chromium.py @@ -1,14 +1,11 @@ import logging import os import re -import shutil -import zipfile - -import requests from bci import cli, util from bci.browser.binary.artisanal_manager import ArtisanalBuildManager from bci.browser.binary.binary import Binary +from bci.database.mongo.binary_cache import BinaryCache from bci.version_control.states.state import State logger = logging.getLogger(__name__) @@ -19,7 +16,6 @@ class ChromiumBinary(Binary): - def __init__(self, state: State): super().__init__(state) @@ -38,41 +34,12 @@ def browser_name(self) -> str: def bin_folder_path(self) -> str: return BIN_FOLDER_PATH - # def get_full_version(self, version: int): - # if re.match(r'[0-9]+\.[0-9]+\.[0-9]+', version): - # return version + ".0" - # if re.match(r'[0-9]+', version): - # return self.repo.get_release_tag(version) - # if re.match(r'[0-9]{2}', version): - # return self.full_versions[version] + ".0" - # raise AttributeError("Could not convert version '%i' to full version" % version) - # return self.repo.get_release_tag(version) - # Downloadable binaries - def download_binary(self): - if self.is_available_locally(): - logger.debug(f'Binary for {self.state} was already downloaded ({self.get_bin_path()})') - return - binary_url = self.state.get_online_binary_url() - logger.info(f'Downloading binary for {self.state} from \'{binary_url}\'') - zip_file_path = f'/tmp/{self.state.name}/archive.zip' - if os.path.exists(os.path.dirname(zip_file_path)): - shutil.rmtree(os.path.dirname(zip_file_path)) - os.makedirs(os.path.dirname(zip_file_path)) - with requests.get(binary_url, stream=True) as req: - with open(zip_file_path, 'wb') as file: - shutil.copyfileobj(req.raw, file) - with zipfile.ZipFile(zip_file_path, 'r') as zip_ref: - zip_ref.extractall(os.path.dirname(zip_file_path)) - bin_path = self.get_potential_bin_path() - os.makedirs(os.path.dirname(bin_path), exist_ok=True) - unzipped_folder_path = os.path.join(os.path.dirname(zip_file_path), "chrome-linux") - self.__remove_unnecessary_files(unzipped_folder_path) - util.safe_move_dir(unzipped_folder_path, os.path.dirname(bin_path)) - cli.execute_and_return_status("chmod -R a+x %s" % os.path.dirname(bin_path)) - # Remove temporary files in /tmp/COMMIT_POS - shutil.rmtree(os.path.dirname(zip_file_path)) + def configure_binary(self): + binary_folder = os.path.dirname(self.get_potential_bin_path()) + self.__remove_unnecessary_files(binary_folder) + cli.execute_and_return_status(f'chmod -R a+x {binary_folder}') def __remove_unnecessary_files(self, binary_folder_path: str) -> None: """ @@ -90,6 +57,7 @@ def _get_version(self) -> str: if bin_path := self.get_bin_path(): output = cli.execute_and_return_output(command, cwd=os.path.dirname(bin_path)) else: + BinaryCache.remove_binary_files(self.state) raise AttributeError(f'Could not get binary path for {self.state}') match = re.match(r'Chromium (?P[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)', output) if match: diff --git a/bci/browser/binary/vendors/firefox.py b/bci/browser/binary/vendors/firefox.py index 2a667b74..e2510d7d 100644 --- a/bci/browser/binary/vendors/firefox.py +++ b/bci/browser/binary/vendors/firefox.py @@ -1,12 +1,8 @@ import logging import os import re -import shutil -import tarfile -import requests - -from bci import cli, util +from bci import cli from bci.browser.binary.artisanal_manager import ArtisanalBuildManager from bci.browser.binary.binary import Binary from bci.version_control.states.state import State @@ -19,7 +15,6 @@ class FirefoxBinary(Binary): - def __init__(self, state: State): super().__init__(state) @@ -35,50 +30,17 @@ def browser_name(self) -> str: def bin_folder_path(self) -> str: return BIN_FOLDER_PATH - def download_binary(self): - if self.is_available_locally(): - logger.debug(f'Binary for {self.state} was already downloaded ({self.get_bin_path()})') - return - binary_url = self.state.get_online_binary_url() - logger.debug(f'Downloading binary for {self.state} from \'{binary_url}\'') - tar_file_path = f'/tmp/{self.state.name}/archive.tar.bz2' - if os.path.exists(os.path.dirname(tar_file_path)): - shutil.rmtree(os.path.dirname(tar_file_path)) - os.makedirs(os.path.dirname(tar_file_path)) - with requests.get(binary_url, stream=True) as req: - with open(tar_file_path, 'wb') as file: - shutil.copyfileobj(req.raw, file) - - # Determine correct archive format based on version - if int(self.state.version) >= 135: - tar_file_path = f'/tmp/{self.state.name}/archive.tar.xz' - tar_mode = "r:xz" - else: - tar_file_path = f'/tmp/{self.state.name}/archive.tar.bz2' - tar_mode = "r:bz2" - # Download the correct archive - with requests.get(binary_url, stream=True) as req: - with open(tar_file_path, 'wb') as file: - shutil.copyfileobj(req.raw, file) - # Extract the archive using the determined format - with tarfile.open(tar_file_path, tar_mode) as tar_ref: - tar_ref.extractall(os.path.dirname(tar_file_path)) - - bin_path = self.get_potential_bin_path() - os.makedirs(os.path.dirname(bin_path), exist_ok=True) - unzipped_folder_path = os.path.join(os.path.dirname(tar_file_path), "firefox") - util.safe_move_dir(unzipped_folder_path, os.path.dirname(bin_path)) - cli.execute_and_return_status("chmod -R a+x %s" % os.path.dirname(bin_path)) - cli.execute_and_return_status("chmod -R a+w %s" % os.path.dirname(bin_path)) - # Remove temporary files in /tmp/COMMIT_POS - shutil.rmtree(os.path.dirname(tar_file_path)) + def configure_binary(self) -> None: + binary_folder = os.path.dirname(self.get_potential_bin_path()) + cli.execute_and_return_status(f'chmod -R a+x {binary_folder}') + cli.execute_and_return_status(f'chmod -R a+w {binary_folder}') # Add policy.json to prevent updating. (this measure is effective from version 60) # https://github.com/mozilla/policy-templates/blob/master/README.md # (For earlier versions, the prefs.js file is used) - distributions_path = os.path.join(os.path.dirname(bin_path), "distribution") + distributions_path = os.path.join(binary_folder, 'distribution') os.makedirs(distributions_path, exist_ok=True) - policies_path = os.path.join(distributions_path, "policies.json") - with open(policies_path, "a") as file: + policies_path = os.path.join(distributions_path, 'policies.json') + with open(policies_path, 'a') as file: file.write('{ "policies": { "DisableAppUpdate": true } }') def _get_version(self): diff --git a/bci/database/mongo/binary_cache.py b/bci/database/mongo/binary_cache.py index bf19086c..9c832250 100644 --- a/bci/database/mongo/binary_cache.py +++ b/bci/database/mongo/binary_cache.py @@ -135,6 +135,10 @@ def store_file(file_path: str) -> None: elapsed_time = time.time() - start_time logger.debug(f'Stored binary in {elapsed_time:.2f}s') + @staticmethod + def remove_binary_files(state: State) -> None: + BinaryCache.__remove_revision_binary_files(state.type, state.index) + @staticmethod def __count_cached_binaries(state_type: Optional[str] = None) -> int: """ diff --git a/bci/util.py b/bci/util.py index f91bf5dc..1875b51d 100644 --- a/bci/util.py +++ b/bci/util.py @@ -6,12 +6,15 @@ import logging import os import shutil +import tarfile import time +import zipfile from typing import Optional +from urllib.parse import urlparse import requests -LOGGER = logging.getLogger(__name__) +logger = logging.getLogger(__name__) def safe_move_file(src_path, dst_path): @@ -37,6 +40,7 @@ def safe_move_dir(src_path, dst_path): safe_move_dir(new_src_path, new_dst_path) else: raise AttributeError("Something went wrong") + shutil.rmtree(src_path) def copy_folder(src_path, dst_path): @@ -83,7 +87,7 @@ def read_web_report(file_name): def request_html(url: str): - LOGGER.debug(f"Requesting {url}") + logger.debug(f"Requesting {url}") resp = requests.get(url, timeout=60) if resp.status_code >= 400: raise PageNotFound(f"Could not connect to url '{url}'") @@ -91,22 +95,85 @@ def request_html(url: str): def request_json(url: str): - LOGGER.debug(f"Requesting {url}") + logger.debug(f"Requesting {url}") resp = requests.get(url, timeout=60) if resp.status_code >= 400: raise PageNotFound(f"Could not connect to url '{url}'") - LOGGER.debug('Request completed') + logger.debug('Request completed') return resp.json() def request_final_url(url: str) -> str: - LOGGER.debug(f"Requesting {url}") + logger.debug(f"Requesting {url}") resp = requests.get(url, timeout=60) if resp.status_code >= 400: raise PageNotFound(f"Could not connect to url '{url}'") - LOGGER.debug('Request completed') + logger.debug('Request completed') return resp.url +def download_and_extract(urls: list[str], dst_folder_path: str) -> bool: + """ + Downloads the archive residing at the given URL and extracts it to the given dest_path. + This method currently supports zip, tar.bz2 and tar.xz archives. + + :return bool: Returns True if the archive was successfully downloaded and extracted, otherwise False. + """ + for url in urls: + logger.debug(f"Attempting to download archive from '{url}'") + tmp_file_name = urlparse(url).path.split('/')[-1] + tmp_file_path = os.path.join('/tmp', tmp_file_name) + if os.path.exists(tmp_file_path): + os.remove(tmp_file_path) + with requests.get(url, stream=True) as req: + if req.status_code != 200: + continue + with open(tmp_file_path, 'wb') as file: + shutil.copyfileobj(req.raw, file) + _, file_extension = os.path.splitext(tmp_file_path) + + logger.debug(f"Extracting downloaded archive '{tmp_file_path}'") + match file_extension: + case '.zip': + unzip(tmp_file_path, dst_folder_path) + case '.bz2': + untar(tmp_file_path, dst_folder_path) + case '.xz': + untar(tmp_file_path, dst_folder_path) + case _: + AttributeError(f"File extension {file_extension} is not supported.") + os.remove(tmp_file_path) + return True + return False + + +def unzip(src_archive_path: str, dst_folder_path: str) -> None: + with zipfile.ZipFile(src_archive_path, 'r') as zip: + members = zip.namelist() + top_dirs_and_files = {name.split('/')[0] for name in members} + # If there is a single top-level directory, we move all contents up. + if len(top_dirs_and_files) == 1: + parent_folder_path = os.path.dirname(dst_folder_path) + zip.extractall(parent_folder_path) + safe_move_dir(os.path.join(parent_folder_path, top_dirs_and_files.pop()), dst_folder_path) + else: + os.makedirs(dst_folder_path, exist_ok=True) + zip.extractall(dst_folder_path) + + +def untar(src_archive_path: str, dst_folder_path: str) -> None: + os.makedirs(dst_folder_path, exist_ok=True) + # We do not inspects contents first like in unzip, because this is a very costly operation for tar archives. + with tarfile.open(src_archive_path, 'r:*') as tar: + tar.extractall(dst_folder_path) + members = os.listdir(dst_folder_path) + top_dirs_and_files = {name.split('/')[0] for name in members} + # If there is a single top-level directory, we move all contents up. + if len(top_dirs_and_files) == 1: + safe_move_dir(os.path.join(dst_folder_path, members.pop()), dst_folder_path + '_2') + shutil.rmtree(dst_folder_path) + safe_move_dir(os.path.join(dst_folder_path + '_2'), dst_folder_path) + + class PageNotFound(Exception): pass diff --git a/bci/version_control/states/revisions/chromium.py b/bci/version_control/states/revisions/chromium.py index 7939e956..7be60106 100644 --- a/bci/version_control/states/revisions/chromium.py +++ b/bci/version_control/states/revisions/chromium.py @@ -22,16 +22,16 @@ def has_online_binary(self) -> bool: if cached_binary_available_online is not None: return cached_binary_available_online url = f'https://www.googleapis.com/storage/v1/b/chromium-browser-snapshots/o/Linux_x64%2F{self._revision_nb}%2Fchrome-linux.zip' - req = requests.get(url) - has_binary_online = req.status_code == 200 + response = requests.get(url, stream=True) + has_binary_online = response.status_code == 200 MongoDB().store_binary_availability_online_cache('chromium', self, has_binary_online) return has_binary_online - def get_online_binary_url(self): - return ( + def get_online_binary_urls(self) -> list[str]: + return [( 'https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/%s%%2F%s%%2Fchrome-%s.zip?alt=media' % ('Linux_x64', self._revision_nb, 'linux') - ) + )] def _fetch_missing_data(self) -> None: """ diff --git a/bci/version_control/states/revisions/firefox.py b/bci/version_control/states/revisions/firefox.py index 928864b5..370a82b5 100644 --- a/bci/version_control/states/revisions/firefox.py +++ b/bci/version_control/states/revisions/firefox.py @@ -26,14 +26,14 @@ def browser_name(self) -> str: def has_online_binary(self) -> bool: return RevisionCache.firefox_has_binary_for(revision_nb=self.revision_nb, revision_id=self._revision_id) - def get_online_binary_url(self) -> str: + def get_online_binary_urls(self) -> list[str]: result = RevisionCache.firefox_get_binary_info(self._revision_id) if result is None: raise AttributeError(f"Could not find binary url for '{self._revision_id}") binary_base_url = result['files_url'] app_version = result['app_version'] binary_url = f'{binary_base_url}firefox-{app_version}.en-US.linux-x86_64.tar.bz2' - return binary_url + return [binary_url] def get_previous_and_next_state_with_binary(self) -> tuple[State, State]: previous_revision_nb, next_revision_nb = RevisionCache.firefox_get_previous_and_next_revision_nb_with_binary( diff --git a/bci/version_control/states/state.py b/bci/version_control/states/state.py index a9d2e1b1..8fe09fa6 100644 --- a/bci/version_control/states/state.py +++ b/bci/version_control/states/state.py @@ -139,7 +139,10 @@ def has_online_binary(self) -> bool: pass @abstractmethod - def get_online_binary_url(self) -> str: + def get_online_binary_urls(self) -> list[str]: + """ + Returns a list of URLs where the associated binary can potentially be downloaded from. + """ pass def has_available_binary(self) -> bool: diff --git a/bci/version_control/states/versions/chromium.py b/bci/version_control/states/versions/chromium.py index 2374729c..7dc986e1 100644 --- a/bci/version_control/states/versions/chromium.py +++ b/bci/version_control/states/versions/chromium.py @@ -30,11 +30,11 @@ def has_online_binary(self): MongoDB().store_binary_availability_online_cache('chromium', self, has_binary_online) return has_binary_online - def get_online_binary_url(self): - return ( + def get_online_binary_urls(self) -> list[str]: + return [( 'https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/%s%%2F%s%%2Fchrome-%s.zip?alt=media' % ('Linux_x64', self._revision_nb, 'linux') - ) + )] def convert_to_revision(self) -> ChromiumRevision: return ChromiumRevision(revision_nb=self._revision_nb) diff --git a/bci/version_control/states/versions/firefox.py b/bci/version_control/states/versions/firefox.py index 67463f08..c48bba39 100644 --- a/bci/version_control/states/versions/firefox.py +++ b/bci/version_control/states/versions/firefox.py @@ -1,10 +1,9 @@ -from bci.version_control.repository.online.firefox import get_release_revision_number, get_release_revision_id +from bci.version_control.repository.online.firefox import get_release_revision_id, get_release_revision_number from bci.version_control.states.revisions.firefox import FirefoxRevision from bci.version_control.states.versions.base import BaseVersion class FirefoxVersion(BaseVersion): - def __init__(self, major_version: int): super().__init__(major_version) @@ -21,8 +20,11 @@ def browser_name(self) -> str: def has_online_binary(self) -> bool: return True - def get_online_binary_url(self) -> str: - return f'https://ftp.mozilla.org/pub/firefox/releases/{self.major_version}.0/linux-x86_64/en-US/firefox-{self.major_version}.0.tar.bz2' + def get_online_binary_urls(self) -> list[str]: + return [ + f'https://ftp.mozilla.org/pub/firefox/releases/{self.major_version}.0/linux-x86_64/en-US/firefox-{self.major_version}.0.tar.bz2', + f'https://ftp.mozilla.org/pub/firefox/releases/{self.major_version}.0/linux-x86_64/en-US/firefox-{self.major_version}.0.tar.xz' + ] def convert_to_revision(self) -> FirefoxRevision: return FirefoxRevision(revision_nb=self._revision_nb)