diff --git a/playwrightcapture/capture.py b/playwrightcapture/capture.py
index 4fc191b..537196b 100644
--- a/playwrightcapture/capture.py
+++ b/playwrightcapture/capture.py
@@ -14,11 +14,12 @@
 import time
 
 from base64 import b64decode, b64encode
+from datetime import datetime
 from io import BytesIO
 from logging import LoggerAdapter, Logger
 from tempfile import NamedTemporaryFile
 from typing import Any, Literal, TYPE_CHECKING
-from collections.abc import MutableMapping
+from collections.abc import Awaitable, Callable, MutableMapping
 from urllib.parse import urlparse, unquote, urljoin, urlsplit, urlunsplit, parse_qs, unquote_plus
 from zipfile import ZipFile
@@ -143,7 +144,8 @@ def __init__(self, browser: BROWSER | None=None, device_name: str | None=None,
                  socks5_dns_resolver: str | list[str] | None=None,
                  general_timeout_in_sec: int | None=None, loglevel: str | int='INFO',
                  uuid: str | None=None, headless: bool=True,
-                 *, init_script: str | None=None, tt_settings: TrustedTimestampSettings | None=None):
+                 *, init_script: str | None=None, tt_settings: TrustedTimestampSettings | None=None,
+                 display: str | None=None):
         """Captures a page with Playwright.
 
         :param browser: The browser to use for the capture.
@@ -153,8 +155,10 @@ def __init__(self, browser: BROWSER | None=None, device_name: str | None=None,
         :param general_timeout_in_sec: The general timeout for the capture, including children.
         :param loglevel: Python loglevel
         :param uuid: The UUID of the capture.
-        :param headless: Whether to run the browser in headless mode. WARNING: requires to run in a graphical environment.
-        :param init_script: An optional JavaScript that will be executed on each page - See https://playwright.dev/python/docs/api/class-browsercontext#browser-context-add-init-script
+        :param headless: Whether to run the browser in headless mode. Set to False only when a graphical environment is available.
+        :param init_script: An optional JavaScript executed on each page - See https://playwright.dev/python/docs/api/class-browsercontext#browser-context-add-init-script
+        :param tt_settings: Optional trusted-timestamp configuration used to timestamp capture artifacts.
+        :param display: Optional X11 display passed to the browser subprocess. Used by interactive headed captures to isolate concurrent sessions.
         """
         master_logger = logging.getLogger('playwrightcapture')
         master_logger.setLevel(loglevel)
@@ -208,6 +212,14 @@ def __init__(self, browser: BROWSER | None=None, device_name: str | None=None,
         self._init_script = init_script
         self.tt_settings = tt_settings
 
+        # X11 display to use for the browser subprocess. Passed via env so each
+        # concurrent capture gets its own display without mutating os.environ.
+        self._display = display
+
+        # Per-page capture state populated by setup_page_capture().
+        self._multiple_downloads: list[tuple[str, bytes]] = []
+        self._store_request: Callable[[Request], Awaitable[None]] | None = None
+        self._mark_favicons_done: Callable[[], None] | None = None
 
         # Initialize the magic DB
         self.magicdb = MagicDb()
@@ -226,11 +238,13 @@ def __prepare_proxy_aiohttp(self, proxy: ProxySettings) -> str:
         return proxy['server']
 
     async def __aenter__(self) -> Capture:
-        '''Launch the browser'''
-        # Ignore the fonts by the time we take the screenshot
+        """Launch Playwright and the configured browser for this capture."""
+
+        # Do not wait for webfonts before taking screenshots.
         # 2026-02-02: the environment is copied into the process when initialized, so we need to set it globally here,
         # and not in the method where we take the screenshot
         os.environ['PW_TEST_SCREENSHOT_NO_FONTS_READY'] = '1'
+
         self.playwright = await async_playwright().start()
 
         if self.device_name:
@@ -247,11 +261,20 @@
             '--unsafely-treat-insecure-origin-as-secure',  # Allows to run crypto API on .onion URLs (See https://github.com/Lookyloo/PlaywrightCapture/issues/65)
         ]
 
+        # Build a per-launch environment so concurrent captures each target
+        # their own X11 display without mutating the process-global DISPLAY.
+        launch_env: dict[str, str | float | bool] | None = None
+        if self._display:
+            launch_env = {**os.environ, 'DISPLAY': self._display}
+            self.logger.info(f'Launching browser on DISPLAY {self._display}')
+        else:
+            self.logger.info(f'Launching browser on default DISPLAY {os.environ.get("DISPLAY", "")}')
         self.browser = await self.playwright[self.browser_name].launch(
             proxy=self.proxy if self.proxy else None,
             channel="chromium" if self.browser_name == "chromium" else None,
             args=args,
-            headless=self.headless
+            headless=self.headless,
+            env=launch_env,
         )
 
         # Set of URLs that were captured in that context
@@ -263,6 +286,7 @@
         return self
 
     async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool:
+        """Close browser resources and suppress exceptions like the upstream context manager."""
         try:
             await self.browser.close(reason="Closing browser at the end of the capture.")
 
@@ -283,6 +307,112 @@
         return True
 
+    async def setup_page_capture(self, page: Page, *, allow_tracking: bool=False) -> None:
+        """Prepare a page for a single-page capture without changing capture semantics.
+
+        This method preserves the existing per-page setup used by capture_page:
+        download tracking, request body storage for image responses, dialog
+        acceptance, and the PDF download workaround in headless Chromium.
+        Interactive sessions reuse it so the operator-driven session can still
+        finalize like a normal single-page capture later on.
+        """
+        got_favicons = False
+
+        # We don't need to be super strict on the lock, as it simply triggers a wait for network idle before stopping the capture
+        # but we still need it to be an integer in case we have more than one download triggered and one finished when the others haven't
+        self.wait_for_download = 0
+
+        # We may have multiple downloads triggered via JS
+        self._multiple_downloads = []
+
+        async def handle_download(download: Download) -> None:
+            # This method is called when a download event is triggered from JS in a page that also renders
+            try:
+                self.wait_for_download += 1
+                with NamedTemporaryFile() as tmp_f:
+                    self.logger.info('Got a download triggered from JS.')
+                    await download.save_as(tmp_f.name)
+                    filename = download.suggested_filename
+                    with open(tmp_f.name, "rb") as f:
+                        file_content = f.read()
+                    self._multiple_downloads.append((filename, file_content))
+                    self.logger.info('Done with download.')
+            except Exception as e:
+                if download.page.is_closed():
+                    # Page is closed, skip logging.
+                    pass
+                else:
+                    self.logger.warning(f'Unable to finish download triggered from JS: {e}')
+            finally:
+                self.wait_for_download -= 1
+
+        async def store_request(request: Request) -> None:
+            # This method is called on each request, to store the body (if it is an image) in a dict indexed by URL
+            if got_favicons or request.resource_type != 'image':
+                return
+            try:
+                if response := await request.response():
+                    if got_favicons:
+                        return
+                    if request.resource_type == 'image' and response.ok:
+                        try:
+                            if body := await response.body():
+                                m = self.magicdb.best_magic_buffer(body)
+                                if m.mime_type.startswith('image'):
+                                    self._requests[request.url] = body
+                        except Exception:
+                            pass
+            except Exception as e:
+                self.logger.info(f'Unable to store request: {e}')
+
+        def mark_favicons_done() -> None:
+            nonlocal got_favicons
+            got_favicons = True
+
+        if self.browser_name == 'chromium' and self.headless:
+            async def _override_content_disposition_handler(route: Route, request: Request) -> None:
+                """Special case to handle PDF rendered in the browser directly"""
+                try:
+                    response = await route.fetch()  # performs the request
+                    overridden_headers = {
+                        **response.headers,
+                        "content-disposition": 'attachment'
+                    }
+                    self.logger.info('Got a PDF in headless chromium, force download')
+                    await route.fulfill(response=response, headers=overridden_headers)
+                except Error as e:
+                    self.logger.info(f'Unable to force download: {e}')
+                    await route.continue_()
+
+            # override in chromium in headless mode, to trigger a download
+            # otherwise it is rendered in the PDF viewer.
+            try:
+                await page.route("**/*.pdf", handler=_override_content_disposition_handler)
+            except Error as e:
+                self.logger.warning(f'Failed at fetching PDF in headless chromium: {e}')
+
+        if allow_tracking:
+            # Add authorization clickthroughs
+            await self.__dialog_didomi_clickthrough(page)
+            await self.__dialog_onetrust_clickthrough(page)
+            await self.__dialog_hubspot_clickthrough(page)
+            await self.__dialog_cookiebot_clickthrough(page)
+            await self.__dialog_complianz_clickthrough(page)
+            await self.__dialog_yahoo_clickthrough(page)
+            await self.__dialog_ppms_clickthrough(page)
+            await self.__dialog_alert_dialog_clickthrough(page)
+            await self.__dialog_clickthrough(page)
+            await self.__dialog_tarteaucitron_clickthrough(page)
+
+        page.set_default_timeout((self._capture_timeout - 2) * 1000)
+        # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
+        self._store_request = store_request
+        self._mark_favicons_done = mark_favicons_done
+
+        page.on("requestfinished", store_request)
+        page.on("dialog", lambda dialog: dialog.accept())
+        page.on("download", handle_download)
+
     @property
     def locale(self) -> str:
         return self._locale
@@ -345,25 +475,23 @@ def cookies(self) -> list[Cookie]:
         return self._cookies
 
     @cookies.setter
-    def cookies(self, cookies: list[Cookie | dict[str, Any]] | None) -> None:
+    def cookies(self, cookies: list[Cookie | dict[str, Any] | object] | None) -> None:
         '''Cookies to send along to the initial request.
 
+        :param cookies: The cookies, in this format: https://playwright.dev/python/docs/api/class-browsercontext#browser-context-add-cookies
         '''
         if not cookies:
             return
-        for cookie in cookies:
-            if not cookie:
+        for raw_cookie in cookies:
+            if not raw_cookie:
                 continue
-            if isinstance(cookie, Cookie):
-                self._cookies.append(cookie)
-            elif isinstance(cookie, dict):
-                try:
-                    self._cookies.append(Cookie.model_validate(cookie))
-                except Exception as e:
-                    self.logger.warning(f'Invalid cookie: {e}')
-            else:
-                # None, ignore
-                pass
+            if isinstance(raw_cookie, Cookie):
+                self._cookies.append(raw_cookie)
+                continue
+            try:
+                self._cookies.append(Cookie.model_validate(raw_cookie))
+            except Exception as e:
+                self.logger.warning(f'Invalid cookie: {e}')
 
     @property
     def storage(self) -> StorageState:
@@ -967,7 +1095,137 @@ async def __instrumentation(self, page: Page, url: str, allow_tracking: bool, fi
         await self._safe_wait(page)
         self.logger.debug('Done with waiting.')
 
-    async def capture_page(self, url: str, *, max_depth_capture_time: int,
+    async def _finalize_capture(
+        self,
+        *,
+        page: Page,
+        to_return: CaptureResponse,
+        errors: list[str],
+        with_trusted_timestamps: bool,
+    ) -> None:
+        """Common finalization logic for captures (downloads, cookies, storage, HAR, socks5, timestamps)."""
+
+        self.logger.debug('Finishing up capture (helper).')
+
+        # We may have multiple downloads triggered via JS; if so, deduplicate them and,
+        # when there is more than one, bundle them into a zip stored in-memory.
+        # This mirrors the behavior previously implemented at the end of capture_page.
+        if self._multiple_downloads:
+            if multiple_dls := set(self._multiple_downloads):
+                if len(multiple_dls) == 1:
+                    dl = multiple_dls.pop()
+                    to_return["downloaded_filename"] = dl[0]
+                    to_return["downloaded_file"] = dl[1]
+                else:
+                    mem_zip = BytesIO()
+                    to_return["downloaded_filename"] = f'{self.uuid}_multiple_downloads.zip'
+                    with ZipFile(mem_zip, 'w') as z:
+                        for i, f_details in enumerate(multiple_dls):
+                            filename, file_content = f_details
+                            z.writestr(f'{i}_{filename}', file_content)
+                    to_return["downloaded_file"] = mem_zip.getvalue()
+
+        # Collect cookies from the context (may time out or fail depending on page state).
+        try:
+            async with timeout(15):
+                # NOTE: Ignore type until we can use python 3.12+ only
+                # playwrightcapture.capture.SetCookieParam == playwright._impl._api_structures.SetCookieParam
+                to_return['cookies'] = await self.context.cookies()  # type: ignore[typeddict-item]
+        except (TimeoutError, asyncio.TimeoutError):
+            self.logger.warning("Unable to get cookies (timeout).")
+            errors.append("Unable to get the cookies (timeout).")
+            self.should_retry = True
+        except Error as e:
+            self.logger.warning(f"Unable to get cookies: {e}")
+            errors.append(f'Unable to get the cookies: {e}')
+            self.should_retry = True
+
+        # Collect storage state, including IndexedDB, to capture the full browser state.
+        try:
+            async with timeout(15):
+                to_return['storage'] = await self.context.storage_state(indexed_db=True)
+        except (TimeoutError, asyncio.TimeoutError):
+            self.logger.warning("Unable to get storage (timeout).")
+            errors.append("Unable to get the storage (timeout).")
+            self.should_retry = True
+        except Error as e:
+            self.logger.warning(f"Unable to get the storage: {e}")
+            errors.append(f'Unable to get the storage: {e}')
+            self.should_retry = True
+
+        try:
+            if not page.is_closed():
+                # Remove request listener if we set one; best-effort only as it is
+                # primarily used for favicon extraction and should not break captures.
+                if self._store_request is not None:
+                    try:
+                        page.remove_listener("requestfinished", self._store_request)
+                    except Exception:
+                        # Best-effort only
+                        pass
+
+                try:
+                    # Give in-flight operations a short grace period, then switch the
+                    # context offline to stop further network activity before closing.
+                    await asyncio.sleep(1)
+                    async with timeout(3):
+                        await self.context.set_offline(True)
+                    self.logger.debug('Page offline.')
+                except (TimeoutError, asyncio.TimeoutError):
+                    self.logger.debug("Unable to switch offline.")
+
+                try:
+                    # Finally close the page itself; failures here are non-fatal but
+                    # are logged to help debug flaky environments.
+                    async with timeout(5):
+                        await page.close(reason="Closing the page because the capture finished.")
+                    self.logger.debug('Page closed.')
+                except (TimeoutError, asyncio.TimeoutError):
+                    self.logger.warning("Unable to close page.")
+
+            # Close the context to flush the HAR file to disk, then load it.
+            async with timeout(30):
+                await self.context.close(reason="Closing the context because the capture finished.")  # context needs to be closed to generate the HAR
+                self.logger.debug('Context closed.')
+                with open(self._temp_harfile.name, 'rb') as _har:
+                    to_return['har'] = orjson.loads(_har.read())
+                self.logger.debug('Got HAR.')
+
+            # When using a socks5 proxy, post-process the HAR to resolve IPs via
+            # the proxy so the stored HAR contains addresses consistent with what
+            # the proxy saw.
+            if (to_return.get('har') and self.proxy and self.proxy.get('server')
+                    and self.proxy['server'].startswith('socks5')):
+                if har := to_return['har']:  # Could be None
+                    try:
+                        async with timeout(120):
+                            await self.socks5_resolver(har)
+                    except (TimeoutError, asyncio.TimeoutError):
+                        self.logger.warning("Unable to resolve all the IPs via the socks5 proxy.")
+                        errors.append("Unable to resolve all the IPs via the socks5 proxy.")
+                        self.should_retry = True
+
+        except (TimeoutError, asyncio.TimeoutError):
+            # If closing the context or generating the HAR takes too long, the
+            # capture is considered incomplete but we still return what we have.
+            self.logger.warning("Unable to close context at the end of the capture.")
+            errors.append("Unable to close context at the end of the capture.")
+            self.should_retry = True
+        except Exception as e:
+            # Any other unexpected failure while finalizing the capture is logged
+            # and surfaced as a generic HAR-generation error.
+            self.logger.warning(f"Other exception while finishing up the capture: {e}.")
+            errors.append(f'Unable to generate HAR file: {e}')
+
+        if errors:
+            to_return['error'] = '\n'.join(errors)
+        if with_trusted_timestamps:
+            try:
+                await self._get_trusted_timestamps(to_return)
+            except Exception as e:
+                self.logger.warning(f'Unable to get trusted timestamps: {e}')
+
+    async def capture_page(self, url: str | None=None, *, max_depth_capture_time: int,
                            referer: str | None=None,
                            page: Page | None=None, depth: int=0,
                            rendered_hostname_only: bool=True,
@@ -975,278 +1233,269 @@ async def capture_page(self, url: str, *, max_depth_capture_time: int,
                           with_favicon: bool=False, allow_tracking: bool=False,
                           with_trusted_timestamps: bool=False,
+                          current_page_only: bool=False,
                           final_wait: int=5
                           ) -> CaptureResponse:
+        """Capture a URL and optionally recurse into child links.
 
-        to_return: CaptureResponse = {}
-        errors: list[str] = []
-        got_favicons = False
-
-        # We don't need to be super strict on the lock, as it simply triggers a wait for network idle before stoping the capture
-        # but we still need it to be an integer in case we have more than one download triggered and one finished when the others haven't
-        self.wait_for_download = 0
-
-        # We may have multiple download triggered via JS
-        multiple_downloads: list[tuple[str, bytes]] = []
+        When `page` is not provided, this method creates and prepares a new page,
+        performs the navigation, and finalizes the capture before returning.
+        Recursive child captures reuse the existing page and therefore skip the
+        outer setup/finalization path.
 
-        async def handle_download(download: Download) -> None:
-            # This method is called when a download event is triggered from JS in a page that also renders
-            try:
-                self.wait_for_download += 1
-                with NamedTemporaryFile() as tmp_f:
-                    self.logger.info('Got a download triggered from JS.')
-                    await download.save_as(tmp_f.name)
-                    filename = download.suggested_filename
-                    with open(tmp_f.name, "rb") as f:
-                        file_content = f.read()
-                    multiple_downloads.append((filename, file_content))
-                    self.logger.info('Done with download.')
-            except Exception as e:
-                if download.page.is_closed():
-                    # Page is closed, skip logging.
-                    pass
-                else:
-                    self.logger.warning(f'Unable to finish download triggered from JS: {e}')
-            finally:
-                self.wait_for_download -= 1
+        When `current_page_only` is True the method snapshots the page as-is
+        (no navigation, no recursion) and then finalizes. This is the path
+        used by interactive captures after setup_page_capture has already been
+        called by the caller.
+        """
 
-        async def store_request(request: Request) -> None:
-            # This method is called on each request, to store the body (if it is an image) in a dict indexed by URL
-            if got_favicons or request.resource_type != 'image':
-                return
-            try:
-                if response := await request.response():
-                    if got_favicons:
-                        return
-                    if request.resource_type == 'image' and response.ok:
-                        try:
-                            if body := await response.body():
-                                m = self.magicdb.best_magic_buffer(body)
-                                if m.mime_type.startswith('image'):
-                                    self._requests[request.url] = body
-                        except Exception:
-                            pass
-            except Exception as e:
-                self.logger.info(f'Unable to store request: {e}')
+        to_return: CaptureResponse = {}
+        errors: list[str] = []
 
-        if page is not None:
+        if current_page_only:
+            if page is None:
+                raise InvalidPlaywrightParameter('current_page_only requires a page argument')
+            capturing_sub = False
+        elif page is not None:
             capturing_sub = True
         else:
             capturing_sub = False
             try:
                 page = await self.context.new_page()
-
-                if self.browser_name == 'chromium' and self.headless:
-                    async def _override_content_disposition_handler(route: Route, request: Request) -> None:
-                        """Special case to handle PDF rendered in the browser directly"""
-                        try:
-                            response = await route.fetch()  # performs the request
-                            overridden_headers = {
-                                **response.headers,
-                                "content-disposition": 'attachment'
-                            }
-                            self.logger.info('Got a PDF in headless chromium, force download')
-                            await route.fulfill(response=response, headers=overridden_headers)
-                        except Error as e:
-                            self.logger.info(f'Unable to force download: {e}')
-                            await route.continue_()
-
-                    # overwrite in chromium in headless mode, to trigger a download
-                    # otherwise it is rendered in the PDF viewer.
-                    try:
-                        await page.route("**/*.pdf", handler=_override_content_disposition_handler)
-                    except Error as e:
-                        self.logger.warning(f'Failed at fetching PDF in headless chromium: {e}')
-
-                # client = await page.context.new_cdp_session(page)
-                # await client.detach()
             except Error as e:
                 self.logger.warning(f'Unable to create new page, the context is in a broken state: {e}')
                 self.should_retry = True
                 to_return['error'] = f'Unable to create new page: {e}'
                 return to_return
 
-        if allow_tracking:
-            # Add authorization clickthroughs
-            await self.__dialog_didomi_clickthrough(page)
-            await self.__dialog_onetrust_clickthrough(page)
-            await self.__dialog_hubspot_clickthrough(page)
-            await self.__dialog_cookiebot_clickthrough(page)
-            await self.__dialog_complianz_clickthrough(page)
-            await self.__dialog_yahoo_clickthrough(page)
-            await self.__dialog_ppms_clickthrough(page)
-            await self.__dialog_alert_dialog_clickthrough(page)
-            await self.__dialog_clickthrough(page)
-            await self.__dialog_tarteaucitron_clickthrough(page)
-
-        page.set_default_timeout((self._capture_timeout - 2) * 1000)
-        # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
-        page.on("requestfinished", store_request)
-        page.on("dialog", lambda dialog: dialog.accept())
+        await self.setup_page_capture(page, allow_tracking=allow_tracking)
 
         try:
-            try:
-                page.on("download", handle_download)
-                await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
-            except Error as initial_error:
-                self._update_exceptions(initial_error)
-                # So this one is really annoying: chromium raises a net::ERR_ABORTED when it hits a download
-                if initial_error.name in ['Download is starting', 'net::ERR_ABORTED']:
-                    # page.goto failed, but it triggered a download event.
-                    # Let's re-trigger it.
-                    try:
-                        async with page.expect_download() as download_info:
-                            try:
-                                await page.goto(url, referer=referer if referer else '')
-                            except Exception:
-                                pass
-                            with NamedTemporaryFile() as tmp_f:
-                                download = await download_info.value
-                                await download.save_as(tmp_f.name)
-                                filename = download.suggested_filename
-                                with open(tmp_f.name, "rb") as f:
-                                    file_content = f.read()
-                                multiple_downloads.append((filename, file_content))
-                    except PlaywrightTimeoutError:
-                        self.logger.debug('No download has been triggered.')
-                        raise initial_error
-                    except Error as e:
-                        try:
-                            error_msg = download.failure()
-                            if not error_msg:
-                                raise e
-                            errors.append(f"Error while downloading: {error_msg}")
-                            self.logger.info(f'Error while downloading: {error_msg}')
-                            self.should_retry = True
-                        except Exception:
-                            raise e
-                else:
-                    raise initial_error
-            else:
-                await self._wait_for_random_timeout(page, 5)  # Wait 5 sec after document loaded
-
-            try:
-                await page.bring_to_front()
-                self.logger.debug('Page moved to front.')
-            except Error as e:
-                self.logger.warning(f'Unable to bring the page to the front: {e}.')
-
-            try:
-                if self.headless:
-                    await self.__instrumentation(page, url, allow_tracking, final_wait)
-                else:
-                    self.logger.debug('Headed mode, skipping instrumentation.')
-                    await self._wait_for_random_timeout(page, self._capture_timeout - 5)
-            except Exception as e:
-                self.logger.exception(f'Error during instrumentation: {e}')
-
-            # ### --------------------------------------
-            # NOTE 2025-11-12: disabling the offline setting as it doesn't seem
-            # to solve the issue with the frames, but causes some failure
-            # while getting the stored state
-
-            # Pass browser to offline mode to get content and make screenshot
-            # await self.context.set_offline(True)
-            # await self._safe_wait(page, 5)
-            # self.logger.info('Browser offline.')
-            # Abort everything
-            # await page.route("**/*", lambda route: route.abort())
-            # await self._safe_wait(page, 5)
-
-            to_return['frames'] = await self.make_frame_tree(page.main_frame)
-
-            # ### --------------------------------------
-
-            # The first content is what we call rendered HTML, keep it as-is
-            if frames := to_return.get('frames'):
-                if content := frames.get('content'):
-                    to_return['html'] = content
-                if u := frames.get('url'):
-                    if not u:
-                        self.logger.error('Unable to get the URL of the main frame.')
-                        u = '/!\\ Unknown /!\\'
-                    to_return['last_redirected_url'] = u
-
-            if 'html' in to_return and to_return['html'] is not None and with_favicon:
-                # We're probably (?) safe only looking for favicons in the main frame.
-                # TODO: check that?
-                try:
-                    to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html'])
-                    got_favicons = True
-                except (TimeoutError, asyncio.TimeoutError) as e:
-                    self.logger.warning(f'[Timeout] Unable to get favicons: {e}')
-                except Exception as e:
-                    self.logger.warning(f'Unable to get favicons: {e}')
-
-            if with_screenshot:
-                to_return['png'] = await self._failsafe_get_screenshot(page)
-
-            # Keep that all the way down there in case the capture failed.
-            self._already_captured.add(url)
-
-            if depth > 0 and to_return.get('html') and to_return['html']:
-                # TODO with children frames:
-                # 1. if the frame hasa URL, use that as base URL/referer for the subsequent captures
-                # 2. if it doesn't, the base URL is the url of the parent (which may or may not be the main frame)
-                if child_urls := self._get_links_from_rendered_page(page.url, to_return['html'], rendered_hostname_only):
-                    to_return['children'] = []
-                    depth -= 1
-                    total_urls = len(child_urls)
-                    max_capture_time = max(int(max_depth_capture_time / total_urls), self._minimal_timeout)
-                    max_captures = int(max_depth_capture_time / max_capture_time)
-                    if max_captures < total_urls:
-                        self.logger.warning(f'Attempting to capture URLs from {page.url} but there are too many ({total_urls}) to capture in too little time. Only capturing the first {max_captures} URLs in the page.')
-                        if max_captures <= 0:
-                            # We don't really have time for even one capture, but let's try anyway.
-                            child_urls = child_urls[:1]
-                        else:
-                            child_urls = child_urls[:max_captures]
-                    self.logger.info(f'Capturing children, {max_captures} URLs')
-                    consecutive_errors = 0
-                    for index, url in enumerate(child_urls):
-                        self.logger.info(f'Capture child {url} - Timeout: {max_capture_time}s')
-                        start_time = time.time()
-                        if page.is_closed():
-                            self.logger.info('Page is closed, unable to capture children.')
-                            break
-                        try:
-                            async with timeout(max_capture_time + 1):  # just adding a bit of padding so playwright has the chance to raise the exception first
-                                child_capture = await self.capture_page(
-                                    url=url, referer=page.url,
-                                    page=page, depth=depth,
-                                    rendered_hostname_only=rendered_hostname_only,
-                                    max_depth_capture_time=max_capture_time,
-                                    with_screenshot=with_screenshot,
-                                    final_wait=final_wait)
-                                if with_trusted_timestamps:
-                                    try:
-                                        await self._get_trusted_timestamps(child_capture)
-                                    except Exception as e:
-                                        self.logger.warning(f'Unable to get the trusted timestamps for the clild capture : {e}.')
-                                to_return['children'].append(child_capture)  # type: ignore[union-attr]
-                        except (TimeoutError, asyncio.TimeoutError):
-                            self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
-                            consecutive_errors += 1
-                        except Exception as e:
-                            self.logger.warning(f'Error while capturing child "{url}": {e}. {len(child_urls) - index - 1} more to go.')
-                            consecutive_errors += 1
-                        else:
-                            consecutive_errors = 0
-                            runtime = int(time.time() - start_time)
-                            self.logger.info(f'Successfully captured child URL: {url} in {runtime}s. {len(child_urls) - index - 1} to go.')
-
-                        if consecutive_errors >= 5:
-                            # if we have more than 5 consecutive errors, the capture is most probably broken, breaking.
-                            self.logger.warning('Got more than 5 consecutive errors while capturing children, breaking.')
-                            errors.append("Got more than 5 consecutive errors while capturing children")
-                            self.should_retry = True
-                            break
-
-                        try:
-                            await page.go_back()
-                        except PlaywrightTimeoutError:
-                            self.logger.info('Go back timed out, it is probably not a big deal.')
-                        except Exception as e:
-                            self.logger.info(f'Unable to go back: {e}.')
+            if current_page_only:
+                # Snapshot the current page state without navigation or recursion.
+                try:
+                    to_return['frames'] = await self.make_frame_tree(page.main_frame)
+
+                    if frames := to_return.get('frames'):
+                        if content := frames.get('content'):
+                            to_return['html'] = content
+                        if u := frames.get('url'):
+                            if not u:
+                                self.logger.error('Unable to get the URL of the main frame.')
+                                u = '/!\\ Unknown /!\\'
+                            to_return['last_redirected_url'] = u
+
+                    if 'html' in to_return and to_return['html'] is not None and with_favicon:
+                        try:
+                            to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html'])
+                            if self._mark_favicons_done is not None:
+                                self._mark_favicons_done()
+                        except (TimeoutError, asyncio.TimeoutError) as e:
+                            self.logger.warning(f'[Timeout] Unable to get favicons on current page: {e}')
+                        except Exception as e:
+                            self.logger.warning(f'Unable to get favicons on current page: {e}')
+
+                    if with_screenshot:
+                        to_return['png'] = await self._failsafe_get_screenshot(page)
+
+                    if captured_url := to_return.get('last_redirected_url'):
+                        self._already_captured.add(captured_url)
+                    else:
+                        self._already_captured.add(page.url)
+                except PlaywrightTimeoutError as e:
+                    errors.append(f"The capture took too long while capturing current page - {e.message}")
+                    self.should_retry = True
+                except (asyncio.TimeoutError, TimeoutError):
+                    errors.append("Something in the capture of the current page took too long")
+                    self.should_retry = True
+                except TargetClosedError as e:
+                    errors.append(f"The target was closed while capturing current page - {e}")
+                    self.should_retry = True
+                except Error as e:
+                    self._update_exceptions(e)
+                    errors.append(e.message)
+                    to_return['error_name'] = e.name
+                    if self._fatal_network_error(e) or self._fatal_auth_error(e) or self.fatal_browser_error(e):
+                        self.logger.info(f'Unable to process current page: {e.name}')
+                    elif self._retry_network_error(e) or self._retry_browser_error(e):
+                        self.logger.info(f'Issue while capturing current page (retrying): {e.message}')
+                        errors.append(f'Issue while capturing current page: {e.message}')
+                        self.should_retry = True
+                    else:
+                        self.logger.exception(f'Something went poorly while capturing current page: "{e.name}" - {e.message}')
+                except Exception as e:
+                    errors.append(str(e))
+                    if str(e) in ['Connection closed while reading from the driver']:
+                        self.logger.info(f'Issue while capturing current page (retrying): {e}')
+                        errors.append(f'Issue while capturing current page: {e}')
+                        self.should_retry = True
+                    else:
+                        raise e
+            else:
+                # Standard navigation + capture path.
+                assert url is not None
+                try:
+                    await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
+                except Error as initial_error:
+                    self._update_exceptions(initial_error)
+                    # So this one is really annoying: chromium raises a net::ERR_ABORTED when it hits a download
+                    if initial_error.name in ['Download is starting', 'net::ERR_ABORTED']:
+                        # page.goto failed, but it triggered a download event.
+                        # Let's re-trigger it.
+                        try:
+                            async with page.expect_download() as download_info:
+                                try:
+                                    await page.goto(url, referer=referer if referer else '')
+                                except Exception:
+                                    pass
+                                with NamedTemporaryFile() as tmp_f:
+                                    download = await download_info.value
+                                    await download.save_as(tmp_f.name)
+                                    filename = download.suggested_filename
+                                    with open(tmp_f.name, "rb") as f:
+                                        file_content = f.read()
+                                    self._multiple_downloads.append((filename, file_content))
+                        except PlaywrightTimeoutError:
+                            self.logger.debug('No download has been triggered.')
+                            raise initial_error
+                        except Error as e:
+                            try:
+                                error_msg = download.failure()
+                                if not error_msg:
+                                    raise e
+                                errors.append(f"Error while downloading: {error_msg}")
+                                self.logger.info(f'Error while downloading: {error_msg}')
+                                self.should_retry = True
+                            except Exception:
+                                raise e
+                        else:
+                            raise initial_error
+                    else:
+                        raise initial_error
+                else:
+                    await self._wait_for_random_timeout(page, 5)  # Wait 5 sec after document loaded
+
+                try:
+                    await page.bring_to_front()
+                    self.logger.debug('Page moved to front.')
+                except Error as e:
+                    self.logger.warning(f'Unable to bring the page to the front: {e}.')
+
+                try:
+                    if self.headless:
+                        await self.__instrumentation(page, url, allow_tracking, final_wait)
+                    else:
+                        self.logger.debug('Headed mode, skipping instrumentation.')
+                        await self._wait_for_random_timeout(page, self._capture_timeout - 5)
+                except Exception as e:
+                    self.logger.exception(f'Error during instrumentation: {e}')
+
+                # ### --------------------------------------
+                # NOTE 2025-11-12: disabling the offline setting as it doesn't seem
+                # to solve the issue with the frames, but causes some failure
+                # while getting the stored state
+
+                # Pass browser to offline mode to get content and make screenshot
+                # await self.context.set_offline(True)
+                # await self._safe_wait(page, 5)
+                # self.logger.info('Browser offline.')
+                # Abort everything
+                # await page.route("**/*", lambda route: route.abort())
+                # await self._safe_wait(page, 5)
+
+                to_return['frames'] = await self.make_frame_tree(page.main_frame)
+
+                # ### --------------------------------------
+
+                # The first content is what we call rendered HTML, keep it as-is
+                if frames := to_return.get('frames'):
+                    if content := frames.get('content'):
+                        to_return['html'] = content
+                    if u := frames.get('url'):
+                        if not u:
+                            self.logger.error('Unable to get the URL of the main frame.')
+                            u = '/!\\ Unknown /!\\'
+                        to_return['last_redirected_url'] = u
+
+                if 'html' in to_return and to_return['html'] is not None and with_favicon:
+                    # We're probably (?) safe only looking for favicons in the main frame.
+                    # TODO: check that?
+                    try:
+                        to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html'])
+                        if self._mark_favicons_done is not None:
+                            self._mark_favicons_done()
+                    except (TimeoutError, asyncio.TimeoutError) as e:
+                        self.logger.warning(f'[Timeout] Unable to get favicons: {e}')
+                    except Exception as e:
+                        self.logger.warning(f'Unable to get favicons: {e}')
+
+                if with_screenshot:
+                    to_return['png'] = await self._failsafe_get_screenshot(page)
+
+                # Keep that all the way down there in case the capture failed.
+                self._already_captured.add(url)
+
+                if depth > 0 and to_return.get('html') and to_return['html']:
+                    # TODO with children frames:
+                    # 1. if the frame has a URL, use that as base URL/referer for the subsequent captures
+                    # 2. if it doesn't, the base URL is the url of the parent (which may or may not be the main frame)
+                    if child_urls := self._get_links_from_rendered_page(page.url, to_return['html'], rendered_hostname_only):
+                        to_return['children'] = []
+                        depth -= 1
+                        total_urls = len(child_urls)
+                        max_capture_time = max(int(max_depth_capture_time / total_urls), self._minimal_timeout)
+                        max_captures = int(max_depth_capture_time / max_capture_time)
+                        if max_captures < total_urls:
+                            self.logger.warning(f'Attempting to capture URLs from {page.url} but there are too many ({total_urls}) to capture in too little time. Only capturing the first {max_captures} URLs in the page.')
+                            if max_captures <= 0:
+                                # We don't really have time for even one capture, but let's try anyway.
+                                child_urls = child_urls[:1]
+                            else:
+                                child_urls = child_urls[:max_captures]
+                        self.logger.info(f'Capturing children, {max_captures} URLs')
+                        consecutive_errors = 0
+                        for index, url in enumerate(child_urls):
+                            self.logger.info(f'Capture child {url} - Timeout: {max_capture_time}s')
+                            start_time = time.time()
+                            if page.is_closed():
+                                self.logger.info('Page is closed, unable to capture children.')
+                                break
+                            try:
+                                async with timeout(max_capture_time + 1):  # just adding a bit of padding so playwright has the chance to raise the exception first
+                                    child_capture = await self.capture_page(
+                                        url=url, referer=page.url,
+                                        page=page, depth=depth,
+                                        rendered_hostname_only=rendered_hostname_only,
+                                        max_depth_capture_time=max_capture_time,
+                                        with_screenshot=with_screenshot,
+                                        final_wait=final_wait)
+                                    if with_trusted_timestamps:
+                                        try:
+                                            await self._get_trusted_timestamps(child_capture)
+                                        except Exception as e:
+                                            self.logger.warning(f'Unable to get the trusted timestamps for the child capture: {e}.')
+                                    to_return['children'].append(child_capture)  # type: ignore[union-attr]
+                            except (TimeoutError, asyncio.TimeoutError):
+                                self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
+                                consecutive_errors += 1
+                            except Exception as e:
+                                self.logger.warning(f'Error while capturing child "{url}": {e}. {len(child_urls) - index - 1} more to go.')
+                                consecutive_errors += 1
+                            else:
+                                consecutive_errors = 0
+                                runtime = int(time.time() - start_time)
+                                self.logger.info(f'Successfully captured child URL: {url} in {runtime}s. {len(child_urls) - index - 1} to go.')
+
+                            if consecutive_errors >= 5:
+                                # if we have more than 5 consecutive errors, the capture is most probably broken, breaking.
+                                self.logger.warning('Got more than 5 consecutive errors while capturing children, breaking.')
+                                errors.append("Got more than 5 consecutive errors while capturing children")
+                                self.should_retry = True
+                                break
+
+                            try:
+                                await page.go_back()
+                            except PlaywrightTimeoutError:
+                                self.logger.info('Go back timed out, it is probably not a big deal.')
+                            except Exception as e:
+                                self.logger.info(f'Unable to go back: {e}.')
 
         except PlaywrightTimeoutError as e:
             errors.append(f"The capture took too long - {e.message}")
@@ -1286,101 +1535,14 @@ async def _override_content_disposition_handler(route: Route, request: Request)
             else:
                 raise e
         finally:
-            self.logger.debug('Finishing up capture.')
             if not capturing_sub:
-                # Deduplicate list
-                if multiple_dls := set(multiple_downloads):
-                    if len(multiple_dls) == 1:
-                        dl = multiple_dls.pop()
-                        to_return["downloaded_filename"] = dl[0]
-                        to_return["downloaded_file"] = dl[1]
-                    else:
-                        # we have multiple downloads, making it a zip, make sure the filename is unique
-                        mem_zip = BytesIO()
-                        to_return["downloaded_filename"] = f'{self.uuid}_multiple_downloads.zip'
-                        with ZipFile(mem_zip, 'w') as z:
-                            for i, f_details in enumerate(multiple_dls):
-                                filename, file_content = f_details
-                                z.writestr(f'{i}_{filename}', file_content)
-                        to_return["downloaded_file"] = mem_zip.getvalue()
-
-                try:
-                    async with timeout(15):
-                        # NOTE: Ignore type until we can use python 3.12 + only
-                        # playwrightcapture.capture.SetCookieParam == playwright._impl._api_structures.SetCookieParam
-                        to_return['cookies'] = await self.context.cookies()  # type: ignore[typeddict-item]
-                except (TimeoutError, asyncio.TimeoutError):
-                    self.logger.warning("Unable to get cookies (timeout).")
-                    errors.append("Unable to get the cookies (timeout).")
-                    self.should_retry = True
-                except Error as e:
-                    self.logger.warning(f"Unable to get cookies: {e}")
-                    errors.append(f'Unable to get the cookies: {e}')
-                    self.should_retry = True
-
-                try:
-                    async with timeout(15):
-                        to_return['storage'] = await self.context.storage_state(indexed_db=True)
-                except (TimeoutError, asyncio.TimeoutError):
-                    self.logger.warning("Unable to get storage (timeout).")
-                    errors.append("Unable to get the storage (timeout).")
-                    self.should_retry = True
-                except Error as e:
-                    self.logger.warning(f"Unable to get the storage: {e}")
-                    errors.append(f'Unable to get the storage: {e}')
-                    self.should_retry = True
-                try:
-                    if not page.is_closed():
-                        try:
-                            page.remove_listener("requestfinished", store_request)
-                            await asyncio.sleep(1)
-                            async with timeout(3):
-                                await self.context.set_offline(True)
-                            self.logger.debug('Page offline.')
-                        except (TimeoutError, asyncio.TimeoutError):
-                            self.logger.debug("Unable switch offline.")
-
-                        try:
-                            async with timeout(5):
-                                await page.close(reason="Closing the page because the capture finished.")
-                            self.logger.debug('Page closed.')
-                        except (TimeoutError, asyncio.TimeoutError):
-                            self.logger.warning("Unable close page.")
-
-                    async with timeout(30):
-                        await self.context.close(reason="Closing the context because the capture finished.")  # context needs to be closed to generate the HAR
-                        self.logger.debug('Context closed.')
-                        with open(self._temp_harfile.name, 'rb') as _har:
-                            to_return['har'] = orjson.loads(_har.read())
-                        self.logger.debug('Got HAR.')
-
-                    if (to_return.get('har') and self.proxy and self.proxy.get('server')
-                            and self.proxy['server'].startswith('socks5')):
-                        # Only if the capture was not done via a socks5 proxy
-                        if har := to_return['har']:  # Could be None
-                            try:
-                                async with timeout(120):
-                                    await self.socks5_resolver(har)
-                            except (TimeoutError, asyncio.TimeoutError):
-                                self.logger.warning("Unable to resolve all the IPs via the socks5 proxy.")
-                                errors.append("Unable to resolve all the IPs via the socks5 proxy.")
-                                self.should_retry = True
-
-                except (TimeoutError, asyncio.TimeoutError):
-                    self.logger.warning("Unable to close context at the end of the capture.")
-                    errors.append("Unable to close context at the end of the capture.")
-                    self.should_retry = True
-                except Exception as e:
-                    self.logger.warning(f"Other exception while finishing up the capture: {e}.")
-                    errors.append(f'Unable to generate HAR file: {e}')
+                await self._finalize_capture(
+                    page=page,
+                    to_return=to_return,
+                    errors=errors,
+                    with_trusted_timestamps=with_trusted_timestamps,
+                )
 
             self.logger.debug('Capture done')
-            if errors:
-                to_return['error'] = '\n'.join(errors)
-            if with_trusted_timestamps:
-                try:
-                    await self._get_trusted_timestamps(to_return)
-                except Exception as e:
-                    self.logger.warning(f'Unable to get trusted timestamps: {e}')
             return to_return
 
     async def _get_trusted_timestamps(self, capture_response: CaptureResponse) -> None:
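
For reviewers wiring this up: a minimal, hypothetical sketch of how the new surface in this diff (the `display` launch argument, `setup_page_capture()`, and `capture_page(current_page_only=True)`) is meant to compose for an operator-driven capture. The `initialize_context()` call, the Xvfb display number, and the fixed 30-second interactive phase are illustrative assumptions, not part of this change:

import asyncio

from playwrightcapture import Capture


async def interactive_capture(url: str, display: str = ':101') -> dict:
    # Each concurrent headed session gets its own X11 display via the new
    # `display` argument, instead of mutating os.environ['DISPLAY'].
    async with Capture(browser='chromium', headless=False, display=display) as capture:
        # Assumption: the existing helper that sets up capture.context
        # (not shown in this diff).
        await capture.initialize_context()
        page = await capture.context.new_page()
        # Install the same download/request/dialog handlers a regular
        # single-page capture gets, so the session can finalize normally.
        await capture.setup_page_capture(page, allow_tracking=True)
        await page.goto(url, wait_until='domcontentloaded')

        await asyncio.sleep(30)  # placeholder for the operator-driven phase

        # Snapshot the page as-is (no navigation, no child recursion), then
        # run the shared finalization: downloads, cookies, storage, HAR.
        return await capture.capture_page(
            page=page,
            current_page_only=True,
            max_depth_capture_time=0,
            with_screenshot=True,
            with_favicon=True,
        )

Passing `max_depth_capture_time=0` is safe in this path because the snapshot branch never recurses into children; the parameter is only consumed when capturing child links.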