From 2e60231091272016ceee8cb3b610bb3a7b325bed Mon Sep 17 00:00:00 2001 From: Cormac Doherty <25778167+DocArmoryTech@users.noreply.github.com> Date: Wed, 1 Apr 2026 13:36:17 +0100 Subject: [PATCH 1/8] refactor(capture): decompose capture_page into helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split `capture_page` into two helper functions to reduce duplication and make the capture lifecycle easier to follow: * `setup_page_capture()`: gathers per-page event wiring that was previously inline in `capture_page`. This includes download tracking, request-body storage for favicon extraction, dialog acceptance, and the headless-Chromium PDF workaround. Returns a `PageCaptureState` TypedDict so that the same state can be passed to finalization without relying on closure variables. * `_finalize_capture()`: consolidates post-navigation teardown formerly in `capture_page`’s `finally` block. This handles multiple-download deduplication/zip, cookie and storage collection, page/context shutdown, HAR loading, SOCKS5 IP resolution, and trusted-timestamp requests. `capture_page` now delegates to these helpers; observable behavior remains unchanged. The `PageCaptureState` TypedDict and the `Awaitable`/`Callable` imports required for its annotations are added in this commit. The commit also adds a `_coerce_cookie_mapping()` helper (which needs the new `Mapping` import) so that the `cookies` setter accepts plain mappings, model objects exposing `model_dump()`/`dict()`, and simple objects with cookie attributes; unsupported cookie payloads are now logged with a warning instead of being ignored silently. Apart from this broader cookie input handling, no feature changes are included in this refactor. 
--- playwrightcapture/capture.py | 550 ++++++++++++++++++++++------------- 1 file changed, 346 insertions(+), 204 deletions(-) diff --git a/playwrightcapture/capture.py b/playwrightcapture/capture.py index 4fc191b..d767f64 100644 --- a/playwrightcapture/capture.py +++ b/playwrightcapture/capture.py @@ -18,7 +18,7 @@ from logging import LoggerAdapter, Logger from tempfile import NamedTemporaryFile from typing import Any, Literal, TYPE_CHECKING -from collections.abc import MutableMapping +from collections.abc import Awaitable, Callable, Mapping, MutableMapping from urllib.parse import urlparse, unquote, urljoin, urlsplit, urlunsplit, parse_qs, unquote_plus from zipfile import ZipFile @@ -105,6 +105,14 @@ class CaptureResponse(TypedDict, total=False): potential_favicons: set[bytes] | None +class PageCaptureState(TypedDict): + """Per-page runtime state shared between setup and finalization.""" + + multiple_downloads: list[tuple[str, bytes]] + store_request: Callable[[Request], Awaitable[None]] + mark_favicons_done: Callable[[], None] + + class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg] """ Prepend log entry with the UUID of the capture @@ -153,8 +161,9 @@ def __init__(self, browser: BROWSER | None=None, device_name: str | None=None, :param general_timeout_in_sec: The general timeout for the capture, including children. :param loglevel: Python loglevel :param uuid: The UUID of the capture. - :param headless: Whether to run the browser in headless mode. WARNING: requires to run in a graphical environment. - :param init_script: An optional JavaScript that will be executed on each page - See https://playwright.dev/python/docs/api/class-browsercontext#browser-context-add-init-script + :param headless: Whether to run the browser in headless mode. Set to False only when a graphical environment is available. 
+ :param init_script: An optional JavaScript executed on each page - See https://playwright.dev/python/docs/api/class-browsercontext#browser-context-add-init-script + :param tt_settings: Optional trusted-timestamp configuration used to timestamp capture artifacts. """ master_logger = logging.getLogger('playwrightcapture') master_logger.setLevel(loglevel) @@ -226,11 +235,13 @@ def __prepare_proxy_aiohttp(self, proxy: ProxySettings) -> str: return proxy['server'] async def __aenter__(self) -> Capture: - '''Launch the browser''' - # Ignore the fonts by the time we take the screenshot + """Launch Playwright and the configured browser for this capture.""" + + # Do not wait for webfonts before taking screenshots. # 2026-02-02: the environment is copied into the process when initialized, so we need to set it globally here, # and not in the method where we take the screenshot os.environ['PW_TEST_SCREENSHOT_NO_FONTS_READY'] = '1' + self.playwright = await async_playwright().start() if self.device_name: @@ -263,6 +274,7 @@ async def __aenter__(self) -> Capture: return self async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool: + """Close browser resources and suppress exceptions like the upstream context manager.""" try: await self.browser.close(reason="Closing browser at the end of the capture.") @@ -283,6 +295,115 @@ async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool: return True + async def setup_page_capture(self, page: Page, *, allow_tracking: bool=False) -> PageCaptureState: + """Prepare a page for a single-page capture without changing capture semantics. + + This method preserves the existing per-page setup used by capture_page: + download tracking, request body storage for image responses, dialog + acceptance, and the PDF download workaround in headless Chromium. + Interactive sessions reuse it so the operator-driven session can still + finalize like a normal single-page capture later on. 
+ """ + got_favicons = False + + # We don't need to be super strict on the lock, as it simply triggers a wait for network idle before stoping the capture + # but we still need it to be an integer in case we have more than one download triggered and one finished when the others haven't + self.wait_for_download = 0 + + # We may have multiple download triggered via JS + multiple_downloads: list[tuple[str, bytes]] = [] + + async def handle_download(download: Download) -> None: + # This method is called when a download event is triggered from JS in a page that also renders + try: + self.wait_for_download += 1 + with NamedTemporaryFile() as tmp_f: + self.logger.info('Got a download triggered from JS.') + await download.save_as(tmp_f.name) + filename = download.suggested_filename + with open(tmp_f.name, "rb") as f: + file_content = f.read() + multiple_downloads.append((filename, file_content)) + self.logger.info('Done with download.') + except Exception as e: + if download.page.is_closed(): + # Page is closed, skip logging. 
+ pass + else: + self.logger.warning(f'Unable to finish download triggered from JS: {e}') + finally: + self.wait_for_download -= 1 + + async def store_request(request: Request) -> None: + # This method is called on each request, to store the body (if it is an image) in a dict indexed by URL + if got_favicons or request.resource_type != 'image': + return + try: + if response := await request.response(): + if got_favicons: + return + if request.resource_type == 'image' and response.ok: + try: + if body := await response.body(): + m = self.magicdb.best_magic_buffer(body) + if m.mime_type.startswith('image'): + self._requests[request.url] = body + except Exception: + pass + except Exception as e: + self.logger.info(f'Unable to store request: {e}') + + def mark_favicons_done() -> None: + nonlocal got_favicons + got_favicons = True + + if self.browser_name == 'chromium' and self.headless: + async def _override_content_disposition_handler(route: Route, request: Request) -> None: + """Special case to handle PDF rendered in the browser directly""" + try: + response = await route.fetch() # performs the request + overridden_headers = { + **response.headers, + "content-disposition": 'attachment' + } + self.logger.info('Got a PDF in headless chromium, force download') + await route.fulfill(response=response, headers=overridden_headers) + except Error as e: + self.logger.info(f'Unable to force download: {e}') + await route.continue_() + + # overwrite in chromium in headless mode, to trigger a download + # otherwise it is rendered in the PDF viewer. 
+ try: + await page.route("**/*.pdf", handler=_override_content_disposition_handler) + except Error as e: + self.logger.warning(f'Failed at fetching PDF in headless chromium: {e}') + + if allow_tracking: + # Add authorization clickthroughs + await self.__dialog_didomi_clickthrough(page) + await self.__dialog_onetrust_clickthrough(page) + await self.__dialog_hubspot_clickthrough(page) + await self.__dialog_cookiebot_clickthrough(page) + await self.__dialog_complianz_clickthrough(page) + await self.__dialog_yahoo_clickthrough(page) + await self.__dialog_ppms_clickthrough(page) + await self.__dialog_alert_dialog_clickthrough(page) + await self.__dialog_clickthrough(page) + await self.__dialog_tarteaucitron_clickthrough(page) + + page.set_default_timeout((self._capture_timeout - 2) * 1000) + # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher + page.on("requestfinished", store_request) + page.on("dialog", lambda dialog: dialog.accept()) + page.on("download", handle_download) + + return { + 'multiple_downloads': multiple_downloads, + 'store_request': store_request, + 'mark_favicons_done': mark_favicons_done, + } + @property def locale(self) -> str: return self._locale @@ -344,26 +465,74 @@ def geolocation(self, geolocation: dict[str, str | int | float] | None) -> None: def cookies(self) -> list[Cookie]: return self._cookies + def _coerce_cookie_mapping(self, cookie: object) -> Mapping[str, Any] | None: + """Normalize supported cookie payload shapes to a mapping. + + Accepts plain mappings, Pydantic-style models, and simple objects with + cookie attributes so older callers can keep passing their existing + cookie objects. 
+ """ + if isinstance(cookie, Mapping): + return cookie + + model_dump = getattr(cookie, 'model_dump', None) + if callable(model_dump): + try: + dumped_cookie = model_dump(exclude_none=True) + except TypeError: + dumped_cookie = model_dump() + if isinstance(dumped_cookie, Mapping): + return dumped_cookie + + dict_method = getattr(cookie, 'dict', None) + if callable(dict_method): + try: + dumped_cookie = dict_method(exclude_none=True) + except TypeError: + dumped_cookie = dict_method() + if isinstance(dumped_cookie, Mapping): + return dumped_cookie + + cookie_name = getattr(cookie, 'name', None) + cookie_value = getattr(cookie, 'value', None) + if cookie_name is None or cookie_value is None: + return None + + normalized_cookie: dict[str, Any] = { + 'name': cookie_name, + 'value': cookie_value, + } + for optional_key in ('url', 'domain', 'path', 'expires', 'httpOnly', 'secure', 'sameSite', 'partitionKey'): + optional_value = getattr(cookie, optional_key, None) + if optional_value is not None: + normalized_cookie[optional_key] = optional_value + return normalized_cookie + @cookies.setter - def cookies(self, cookies: list[Cookie | dict[str, Any]] | None) -> None: + def cookies(self, cookies: list[Cookie | dict[str, Any] | object] | None) -> None: '''Cookies to send along to the initial request. + Accepts Playwright cookie dictionaries as well as model/object wrappers + exposing equivalent fields. 
+ :param cookies: The cookies, in this format: https://playwright.dev/python/docs/api/class-browsercontext#browser-context-add-cookies ''' if not cookies: return - for cookie in cookies: - if not cookie: + for raw_cookie in cookies: + if not raw_cookie: + continue + if isinstance(raw_cookie, Cookie): + self._cookies.append(raw_cookie) continue - if isinstance(cookie, Cookie): - self._cookies.append(cookie) - elif isinstance(cookie, dict): - try: - self._cookies.append(Cookie.model_validate(cookie)) - except Exception as e: - self.logger.warning(f'Invalid cookie: {e}') - else: - # None, ignore - pass + + cookie = self._coerce_cookie_mapping(raw_cookie) + if cookie is None: + self.logger.warning(f'Ignoring unsupported cookie payload: {raw_cookie!r}') + continue + try: + self._cookies.append(Cookie.model_validate(cookie)) + except Exception as e: + self.logger.warning(f'Invalid cookie: {e}') @property def storage(self) -> StorageState: @@ -967,6 +1136,144 @@ async def __instrumentation(self, page: Page, url: str, allow_tracking: bool, fi await self._safe_wait(page) self.logger.debug('Done with waiting.') + async def _finalize_capture( + self, + *, + page: Page, + store_request: Callable[[Request], Awaitable[None]] | None, + multiple_downloads: list[tuple[str, bytes]] | None, + to_return: CaptureResponse, + errors: list[str], + with_trusted_timestamps: bool, + ) -> None: + """Common finalization logic for captures (downloads, cookies, storage, HAR, socks5, timestamps). + + This helper centralizes the tail of a capture, which previously lived at the end + of capture_page. It is now also used by capture_current_page, consuming the + state returned by setup_page_capture when available, to avoid code duplication + while keeping single-page finalization behavior aligned. 
+ """ + + self.logger.debug('Finishing up capture (helper).') + + # We may have multiple downloads triggered via JS; if so, deduplicate them and, + # when there is more than one, bundle them into a zip stored in-memory. + # This mirrors the behavior previously implemented at the end of capture_page. + if multiple_downloads is not None: + if multiple_dls := set(multiple_downloads): + if len(multiple_dls) == 1: + dl = multiple_dls.pop() + to_return["downloaded_filename"] = dl[0] + to_return["downloaded_file"] = dl[1] + else: + mem_zip = BytesIO() + to_return["downloaded_filename"] = f'{self.uuid}_multiple_downloads.zip' + with ZipFile(mem_zip, 'w') as z: + for i, f_details in enumerate(multiple_dls): + filename, file_content = f_details + z.writestr(f'{i}_{filename}', file_content) + to_return["downloaded_file"] = mem_zip.getvalue() + + # Collect cookies from the context (may time out or fail depending on page state). + try: + async with timeout(15): + # NOTE: Ignore type until we can use python 3.12 + only + # playwrightcapture.capture.SetCookieParam == playwright._impl._api_structures.SetCookieParam + to_return['cookies'] = await self.context.cookies() # type: ignore[typeddict-item] + except (TimeoutError, asyncio.TimeoutError): + self.logger.warning("Unable to get cookies (timeout).") + errors.append("Unable to get the cookies (timeout).") + self.should_retry = True + except Error as e: + self.logger.warning(f"Unable to get cookies: {e}") + errors.append(f'Unable to get the cookies: {e}') + self.should_retry = True + + # Collect storage state, including IndexedDB, to capture the full browser state. 
+ try: + async with timeout(15): + to_return['storage'] = await self.context.storage_state(indexed_db=True) + except (TimeoutError, asyncio.TimeoutError): + self.logger.warning("Unable to get storage (timeout).") + errors.append("Unable to get the storage (timeout).") + self.should_retry = True + except Error as e: + self.logger.warning(f"Unable to get the storage: {e}") + errors.append(f'Unable to get the storage: {e}') + self.should_retry = True + + try: + if not page.is_closed(): + # Remove request listener if we set one; best-effort only as it is + # primarily used for favicon extraction and should not break captures. + if store_request is not None: + try: + page.remove_listener("requestfinished", store_request) + except Exception: + # Best-effort only + pass + + try: + # Give in-flight operations a short grace period, then switch the + # context offline to stop further network activity before closing. + await asyncio.sleep(1) + async with timeout(3): + await self.context.set_offline(True) + self.logger.debug('Page offline.') + except (TimeoutError, asyncio.TimeoutError): + self.logger.debug("Unable switch offline.") + + try: + # Finally close the page itself; failures here are non-fatal but + # are logged to help debug flaky environments. + async with timeout(5): + await page.close(reason="Closing the page because the capture finished.") + self.logger.debug('Page closed.') + except (TimeoutError, asyncio.TimeoutError): + self.logger.warning("Unable close page.") + + # Close the context to flush the HAR file to disk, then load it. 
+ async with timeout(30): + await self.context.close(reason="Closing the context because the capture finished.") # context needs to be closed to generate the HAR + self.logger.debug('Context closed.') + with open(self._temp_harfile.name, 'rb') as _har: + to_return['har'] = orjson.loads(_har.read()) + self.logger.debug('Got HAR.') + + # When using a socks5 proxy, post-process the HAR to resolve IPs via + # the proxy so the stored HAR contains addresses consistent with what + # the proxy saw. + if (to_return.get('har') and self.proxy and self.proxy.get('server') + and self.proxy['server'].startswith('socks5')): + if har := to_return['har']: # Could be None + try: + async with timeout(120): + await self.socks5_resolver(har) + except (TimeoutError, asyncio.TimeoutError): + self.logger.warning("Unable to resolve all the IPs via the socks5 proxy.") + errors.append("Unable to resolve all the IPs via the socks5 proxy.") + self.should_retry = True + + except (TimeoutError, asyncio.TimeoutError): + # If closing the context or generating the HAR takes too long, the + # capture is considered incomplete but we still return what we have. + self.logger.warning("Unable to close context at the end of the capture.") + errors.append("Unable to close context at the end of the capture.") + self.should_retry = True + except Exception as e: + # Any other unexpected failure while finalizing the capture is logged + # and surfaced as a generic HAR-generation error. 
+ self.logger.warning(f"Other exception while finishing up the capture: {e}.") + errors.append(f'Unable to generate HAR file: {e}') + + if errors: + to_return['error'] = '\n'.join(errors) + if with_trusted_timestamps: + try: + await self._get_trusted_timestamps(to_return) + except Exception as e: + self.logger.warning(f'Unable to get trusted timestamps: {e}') + async def capture_page(self, url: str, *, max_depth_capture_time: int, referer: str | None=None, page: Page | None=None, depth: int=0, @@ -977,57 +1284,17 @@ async def capture_page(self, url: str, *, max_depth_capture_time: int, with_trusted_timestamps: bool=False, final_wait: int=5 ) -> CaptureResponse: + """Capture a URL and optionally recurse into child links. + + When `page` is not provided, this method creates and prepares a new page, + performs the navigation, and finalizes the capture before returning. + Recursive child captures reuse the existing page and therefore skip the + outer setup/finalization path. + """ to_return: CaptureResponse = {} errors: list[str] = [] - got_favicons = False - - # We don't need to be super strict on the lock, as it simply triggers a wait for network idle before stoping the capture - # but we still need it to be an integer in case we have more than one download triggered and one finished when the others haven't - self.wait_for_download = 0 - - # We may have multiple download triggered via JS - multiple_downloads: list[tuple[str, bytes]] = [] - - async def handle_download(download: Download) -> None: - # This method is called when a download event is triggered from JS in a page that also renders - try: - self.wait_for_download += 1 - with NamedTemporaryFile() as tmp_f: - self.logger.info('Got a download triggered from JS.') - await download.save_as(tmp_f.name) - filename = download.suggested_filename - with open(tmp_f.name, "rb") as f: - file_content = f.read() - multiple_downloads.append((filename, file_content)) - self.logger.info('Done with download.') - except 
Exception as e: - if download.page.is_closed(): - # Page is closed, skip logging. - pass - else: - self.logger.warning(f'Unable to finish download triggered from JS: {e}') - finally: - self.wait_for_download -= 1 - - async def store_request(request: Request) -> None: - # This method is called on each request, to store the body (if it is an image) in a dict indexed by URL - if got_favicons or request.resource_type != 'image': - return - try: - if response := await request.response(): - if got_favicons: - return - if request.resource_type == 'image' and response.ok: - try: - if body := await response.body(): - m = self.magicdb.best_magic_buffer(body) - if m.mime_type.startswith('image'): - self._requests[request.url] = body - except Exception: - pass - except Exception as e: - self.logger.info(f'Unable to store request: {e}') + page_capture_state: PageCaptureState | None = None if page is not None: capturing_sub = True @@ -1035,58 +1302,16 @@ async def store_request(request: Request) -> None: capturing_sub = False try: page = await self.context.new_page() - - if self.browser_name == 'chromium' and self.headless: - async def _override_content_disposition_handler(route: Route, request: Request) -> None: - """Special case to handle PDF rendered in the browser directly""" - try: - response = await route.fetch() # performs the request - overridden_headers = { - **response.headers, - "content-disposition": 'attachment' - } - self.logger.info('Got a PDF in headless chromium, force download') - await route.fulfill(response=response, headers=overridden_headers) - except Error as e: - self.logger.info(f'Unable to force download: {e}') - await route.continue_() - - # overwrite in chromium in headless mode, to trigger a download - # otherwise it is rendered in the PDF viewer. 
- try: - await page.route("**/*.pdf", handler=_override_content_disposition_handler) - except Error as e: - self.logger.warning(f'Failed at fetching PDF in headless chromium: {e}') - - # client = await page.context.new_cdp_session(page) - # await client.detach() except Error as e: self.logger.warning(f'Unable to create new page, the context is in a broken state: {e}') self.should_retry = True to_return['error'] = f'Unable to create new page: {e}' return to_return - if allow_tracking: - # Add authorization clickthroughs - await self.__dialog_didomi_clickthrough(page) - await self.__dialog_onetrust_clickthrough(page) - await self.__dialog_hubspot_clickthrough(page) - await self.__dialog_cookiebot_clickthrough(page) - await self.__dialog_complianz_clickthrough(page) - await self.__dialog_yahoo_clickthrough(page) - await self.__dialog_ppms_clickthrough(page) - await self.__dialog_alert_dialog_clickthrough(page) - await self.__dialog_clickthrough(page) - await self.__dialog_tarteaucitron_clickthrough(page) - - page.set_default_timeout((self._capture_timeout - 2) * 1000) - # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher - page.on("requestfinished", store_request) - page.on("dialog", lambda dialog: dialog.accept()) + page_capture_state = await self.setup_page_capture(page, allow_tracking=allow_tracking) try: try: - page.on("download", handle_download) await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '') except Error as initial_error: self._update_exceptions(initial_error) @@ -1106,7 +1331,8 @@ async def _override_content_disposition_handler(route: Route, request: Request) filename = download.suggested_filename with open(tmp_f.name, "rb") as f: file_content = f.read() - multiple_downloads.append((filename, file_content)) + if page_capture_state is not None: + page_capture_state['multiple_downloads'].append((filename, file_content)) except PlaywrightTimeoutError: 
self.logger.debug('No download has been triggered.') raise initial_error @@ -1171,7 +1397,8 @@ async def _override_content_disposition_handler(route: Route, request: Request) # TODO: check that? try: to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html']) - got_favicons = True + if page_capture_state is not None: + page_capture_state['mark_favicons_done']() except (TimeoutError, asyncio.TimeoutError) as e: self.logger.warning(f'[Timeout] Unable to get favicons: {e}') except Exception as e: @@ -1286,101 +1513,16 @@ async def _override_content_disposition_handler(route: Route, request: Request) else: raise e finally: - self.logger.debug('Finishing up capture.') if not capturing_sub: - # Deduplicate list - if multiple_dls := set(multiple_downloads): - if len(multiple_dls) == 1: - dl = multiple_dls.pop() - to_return["downloaded_filename"] = dl[0] - to_return["downloaded_file"] = dl[1] - else: - # we have multiple downloads, making it a zip, make sure the filename is unique - mem_zip = BytesIO() - to_return["downloaded_filename"] = f'{self.uuid}_multiple_downloads.zip' - with ZipFile(mem_zip, 'w') as z: - for i, f_details in enumerate(multiple_dls): - filename, file_content = f_details - z.writestr(f'{i}_{filename}', file_content) - to_return["downloaded_file"] = mem_zip.getvalue() - - try: - async with timeout(15): - # NOTE: Ignore type until we can use python 3.12 + only - # playwrightcapture.capture.SetCookieParam == playwright._impl._api_structures.SetCookieParam - to_return['cookies'] = await self.context.cookies() # type: ignore[typeddict-item] - except (TimeoutError, asyncio.TimeoutError): - self.logger.warning("Unable to get cookies (timeout).") - errors.append("Unable to get the cookies (timeout).") - self.should_retry = True - except Error as e: - self.logger.warning(f"Unable to get cookies: {e}") - errors.append(f'Unable to get the cookies: {e}') - self.should_retry = True - - try: - async with timeout(15): - 
to_return['storage'] = await self.context.storage_state(indexed_db=True) - except (TimeoutError, asyncio.TimeoutError): - self.logger.warning("Unable to get storage (timeout).") - errors.append("Unable to get the storage (timeout).") - self.should_retry = True - except Error as e: - self.logger.warning(f"Unable to get the storage: {e}") - errors.append(f'Unable to get the storage: {e}') - self.should_retry = True - try: - if not page.is_closed(): - try: - page.remove_listener("requestfinished", store_request) - await asyncio.sleep(1) - async with timeout(3): - await self.context.set_offline(True) - self.logger.debug('Page offline.') - except (TimeoutError, asyncio.TimeoutError): - self.logger.debug("Unable switch offline.") - - try: - async with timeout(5): - await page.close(reason="Closing the page because the capture finished.") - self.logger.debug('Page closed.') - except (TimeoutError, asyncio.TimeoutError): - self.logger.warning("Unable close page.") - - async with timeout(30): - await self.context.close(reason="Closing the context because the capture finished.") # context needs to be closed to generate the HAR - self.logger.debug('Context closed.') - with open(self._temp_harfile.name, 'rb') as _har: - to_return['har'] = orjson.loads(_har.read()) - self.logger.debug('Got HAR.') - - if (to_return.get('har') and self.proxy and self.proxy.get('server') - and self.proxy['server'].startswith('socks5')): - # Only if the capture was not done via a socks5 proxy - if har := to_return['har']: # Could be None - try: - async with timeout(120): - await self.socks5_resolver(har) - except (TimeoutError, asyncio.TimeoutError): - self.logger.warning("Unable to resolve all the IPs via the socks5 proxy.") - errors.append("Unable to resolve all the IPs via the socks5 proxy.") - self.should_retry = True - - except (TimeoutError, asyncio.TimeoutError): - self.logger.warning("Unable to close context at the end of the capture.") - errors.append("Unable to close context at the end of 
the capture.") - self.should_retry = True - except Exception as e: - self.logger.warning(f"Other exception while finishing up the capture: {e}.") - errors.append(f'Unable to generate HAR file: {e}') + await self._finalize_capture( + page=page, + store_request=page_capture_state['store_request'] if page_capture_state is not None else None, + multiple_downloads=page_capture_state['multiple_downloads'] if page_capture_state is not None else None, + to_return=to_return, + errors=errors, + with_trusted_timestamps=with_trusted_timestamps, + ) self.logger.debug('Capture done') - if errors: - to_return['error'] = '\n'.join(errors) - if with_trusted_timestamps: - try: - await self._get_trusted_timestamps(to_return) - except Exception as e: - self.logger.warning(f'Unable to get trusted timestamps: {e}') return to_return async def _get_trusted_timestamps(self, capture_response: CaptureResponse) -> None: From 8dfe0eef368799e677967d603a1f7266cd78ea59 Mon Sep 17 00:00:00 2001 From: Cormac Doherty <25778167+DocArmoryTech@users.noreply.github.com> Date: Wed, 1 Apr 2026 14:09:29 +0100 Subject: [PATCH 2/8] feat(capture): add display parameter and capture_current_page method `Capture.__init__` now accepts an optional `display` parameter (default `None`). When provided, `__aenter__` constructs a per-launch environment dictionary that overrides `DISPLAY`, allowing each concurrent interactive session to target its own X11 server without modifying the process-global `os.environ`. A new `capture_current_page()` method captures the current page state without navigating or recursing into child URLs. It reuses `setup_page_capture` (called by the caller before navigation) and the existing `_finalize_capture` workflow for cookies, storage, HAR, and trusted timestamps. This method serves as the final-capture step when an interactive session signals it is ready. 
--- playwrightcapture/capture.py | 126 ++++++++++++++++++++++++++++++++++- 1 file changed, 124 insertions(+), 2 deletions(-) diff --git a/playwrightcapture/capture.py b/playwrightcapture/capture.py index d767f64..7b7f973 100644 --- a/playwrightcapture/capture.py +++ b/playwrightcapture/capture.py @@ -151,7 +151,8 @@ def __init__(self, browser: BROWSER | None=None, device_name: str | None=None, socks5_dns_resolver: str | list[str] | None=None, general_timeout_in_sec: int | None=None, loglevel: str | int='INFO', uuid: str | None=None, headless: bool=True, - *, init_script: str | None=None, tt_settings: TrustedTimestampSettings | None=None): + *, init_script: str | None=None, tt_settings: TrustedTimestampSettings | None=None, + display: str | None=None): """Captures a page with Playwright. :param browser: The browser to use for the capture. @@ -164,6 +165,7 @@ def __init__(self, browser: BROWSER | None=None, device_name: str | None=None, :param headless: Whether to run the browser in headless mode. Set to False only when a graphical environment is available. :param init_script: An optional JavaScript executed on each page - See https://playwright.dev/python/docs/api/class-browsercontext#browser-context-add-init-script :param tt_settings: Optional trusted-timestamp configuration used to timestamp capture artifacts. + :param display: Optional X11 display passed to the browser subprocess. Used by interactive headed captures to isolate concurrent sessions. """ master_logger = logging.getLogger('playwrightcapture') master_logger.setLevel(loglevel) @@ -217,6 +219,9 @@ def __init__(self, browser: BROWSER | None=None, device_name: str | None=None, self._init_script = init_script self.tt_settings = tt_settings + # X11 display to use for the browser subprocess. Passed via env so each + # concurrent capture gets its own display without mutating os.environ. 
+ self._display = display # Initialize the magic DB self.magicdb = MagicDb() @@ -258,11 +263,20 @@ async def __aenter__(self) -> Capture: '--unsafely-treat-insecure-origin-as-secure', # Allows to run crypto API on .onion URLs (See https://github.com/Lookyloo/PlaywrightCapture/issues/65) ] + # Build a per-launch environment so concurrent captures each target + # their own X11 display without mutating the process-global DISPLAY. + launch_env: dict[str, str] | None = None + if self._display: + launch_env = {**os.environ, 'DISPLAY': self._display} + self.logger.info(f'Launching browser on DISPLAY {self._display}') + else: + self.logger.info(f'Launching browser on default DISPLAY {os.environ.get("DISPLAY", "")}') self.browser = await self.playwright[self.browser_name].launch( proxy=self.proxy if self.proxy else None, channel="chromium" if self.browser_name == "chromium" else None, args=args, - headless=self.headless + headless=self.headless, + env=launch_env, ) # Set of URLs that were captured in that context @@ -1525,6 +1539,114 @@ async def capture_page(self, url: str, *, max_depth_capture_time: int, self.logger.debug('Capture done') return to_return + async def capture_current_page( + self, + page: Page, + *, + rendered_hostname_only: bool = True, + with_screenshot: bool = True, + with_favicon: bool = False, + with_trusted_timestamps: bool = False, + page_capture_state: PageCaptureState | None = None, + ) -> CaptureResponse: + """Capture the state of the current page only. + + This method is the final-page path used by interactive captures. It does + not navigate, recurse into links, or perform crawler-style expansion. + It snapshots the page as it exists when called, then runs the normal + single-page finalization steps. If the caller already ran + setup_page_capture, pass its state so download and favicon bookkeeping are + finalized consistently. 
+ """ + + to_return: CaptureResponse = {} + errors: list[str] = [] + + try: + # Build frame tree and extract main HTML / URL, similar to capture_page + to_return['frames'] = await self.make_frame_tree(page.main_frame) + + if frames := to_return.get('frames'): + # The first content is what we call rendered HTML, keep it as-is + if content := frames.get('content'): + to_return['html'] = content + if u := frames.get('url'): + if not u: + self.logger.error('Unable to get the URL of the main frame.') + u = '/!\\ Unknown /!\\' + to_return['last_redirected_url'] = u + + if 'html' in to_return and to_return['html'] is not None and with_favicon: + # We're probably (?) safe only looking for favicons in the main frame. + # TODO: check that? + try: + to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html']) + if page_capture_state is not None: + page_capture_state['mark_favicons_done']() + except (TimeoutError, asyncio.TimeoutError) as e: + self.logger.warning(f'[Timeout] Unable to get favicons on current page: {e}') + except Exception as e: + self.logger.warning(f'Unable to get favicons on current page: {e}') + + if with_screenshot: + to_return['png'] = await self._failsafe_get_screenshot(page) + + # Keep that all the way down there in case the capture failed. 
+ if url := to_return.get('last_redirected_url'): + self._already_captured.add(url) + else: + self._already_captured.add(page.url) + + except PlaywrightTimeoutError as e: + errors.append(f"The capture took too long while capturing current page - {e.message}") + self.should_retry = True + except (asyncio.TimeoutError, TimeoutError): + errors.append("Something in the capture of the current page took too long") + self.should_retry = True + except TargetClosedError as e: + errors.append(f"The target was closed while capturing current page - {e}") + self.should_retry = True + except Error as e: + # NOTE: there are a lot of errors that look like duplicates and they are triggered at different times in the process. + # it is tricky to figure our which one should (and should not) trigger a retry. Below is our best guess and it will change over time. + self._update_exceptions(e) + errors.append(e.message) + to_return['error_name'] = e.name + # TODO: check e.message and figure out if it is worth retrying or not. + # NOTE: e.name is generally (always?) "Error" + if self._fatal_network_error(e) or self._fatal_auth_error(e) or self.fatal_browser_error(e): + self.logger.info(f'Unable to process current page: {e.name}') + elif self._retry_network_error(e) or self._retry_browser_error(e): + # this one sounds like something we can retry... + self.logger.info(f'Issue while capturing current page (retrying): {e.message}') + errors.append(f'Issue while capturing current page: {e.message}') + self.should_retry = True + else: + # Unexpected ones + self.logger.exception(f'Something went poorly while capturing current page: "{e.name}" - {e.message}') + except Exception as e: + # we may get a non-playwright exception to. + # The ones we try to handle here should be treated as if they were. 
+ errors.append(str(e)) + if str(e) in ['Connection closed while reading from the driver']: + self.logger.info(f'Issue while capturing current page (retrying): {e}') + errors.append(f'Issue while capturing current page: {e}') + self.should_retry = True + else: + raise e + + await self._finalize_capture( + page=page, + store_request=page_capture_state['store_request'] if page_capture_state is not None else None, + multiple_downloads=page_capture_state['multiple_downloads'] if page_capture_state is not None else None, + to_return=to_return, + errors=errors, + with_trusted_timestamps=with_trusted_timestamps, + ) + + self.logger.debug('Current-page capture done') + return to_return + async def _get_trusted_timestamps(self, capture_response: CaptureResponse) -> None: """Get trusted timestamps for the relevant values in the response""" if not self.tt_settings: From 0b113ca6b480457160a25c6bc18be01d86e49dda Mon Sep 17 00:00:00 2001 From: Cormac Doherty <25778167+DocArmoryTech@users.noreply.github.com> Date: Wed, 1 Apr 2026 15:48:22 +0100 Subject: [PATCH 3/8] fix: correct datetime import --- playwrightcapture/capture.py | 1 + 1 file changed, 1 insertion(+) diff --git a/playwrightcapture/capture.py b/playwrightcapture/capture.py index 7b7f973..f347637 100644 --- a/playwrightcapture/capture.py +++ b/playwrightcapture/capture.py @@ -14,6 +14,7 @@ import time from base64 import b64decode, b64encode +from datetime import datetime from io import BytesIO from logging import LoggerAdapter, Logger from tempfile import NamedTemporaryFile From 18ca96ae158b017fa91f7e53ee82a2136c9abe47 Mon Sep 17 00:00:00 2001 From: Cormac Doherty <25778167+DocArmoryTech@users.noreply.github.com> Date: Thu, 2 Apr 2026 08:27:05 +0100 Subject: [PATCH 4/8] fix(types): widen launch_env type to match Playwright BrowserType.launch signature The env parameter of BrowserType.launch() expects dict[str, str | float | bool] | None, not dict[str, str] | None. 
Widen the launch_env annotation to satisfy mypy --strict. --- playwrightcapture/capture.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playwrightcapture/capture.py b/playwrightcapture/capture.py index f347637..f236e51 100644 --- a/playwrightcapture/capture.py +++ b/playwrightcapture/capture.py @@ -266,7 +266,7 @@ async def __aenter__(self) -> Capture: # Build a per-launch environment so concurrent captures each target # their own X11 display without mutating the process-global DISPLAY. - launch_env: dict[str, str] | None = None + launch_env: dict[str, str | float | bool] | None = None if self._display: launch_env = {**os.environ, 'DISPLAY': self._display} self.logger.info(f'Launching browser on DISPLAY {self._display}') From 6589d1228ea6a83d79863a84497a9deab5daeb78 Mon Sep 17 00:00:00 2001 From: Cormac Doherty <25778167+DocArmoryTech@users.noreply.github.com> Date: Thu, 2 Apr 2026 08:55:58 +0100 Subject: [PATCH 5/8] style: normalize default arg spacing in capture_current_page Align keyword-only parameter defaults with the rest of the codebase (no spaces around '='). --- playwrightcapture/capture.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/playwrightcapture/capture.py b/playwrightcapture/capture.py index f236e51..0e8611c 100644 --- a/playwrightcapture/capture.py +++ b/playwrightcapture/capture.py @@ -1544,11 +1544,11 @@ async def capture_current_page( self, page: Page, *, - rendered_hostname_only: bool = True, - with_screenshot: bool = True, - with_favicon: bool = False, - with_trusted_timestamps: bool = False, - page_capture_state: PageCaptureState | None = None, + rendered_hostname_only: bool=True, + with_screenshot: bool=True, + with_favicon: bool=False, + with_trusted_timestamps: bool=False, + page_capture_state: PageCaptureState | None=None, ) -> CaptureResponse: """Capture the state of the current page only. 
From d5507b9b16098e2ed3fa7b92e6a0a69e062b13a8 Mon Sep 17 00:00:00 2001 From: Cormac Doherty <25778167+DocArmoryTech@users.noreply.github.com> Date: Thu, 2 Apr 2026 09:05:37 +0100 Subject: [PATCH 6/8] fix: ensure _finalize_capture runs on re-raised exceptions in capture_current_page When the bare 'except Exception' handler re-raises, _finalize_capture was skipped because it was called sequentially after the try/except rather than in a finally block. This left the page/context unclosed and the HAR file unflushed. Wrap the try/except in an outer try/finally, matching the pattern already used by capture_page. --- playwrightcapture/capture.py | 149 ++++++++++++++++++----------------- 1 file changed, 75 insertions(+), 74 deletions(-) diff --git a/playwrightcapture/capture.py b/playwrightcapture/capture.py index 0e8611c..7adb1ac 100644 --- a/playwrightcapture/capture.py +++ b/playwrightcapture/capture.py @@ -1564,86 +1564,87 @@ async def capture_current_page( errors: list[str] = [] try: - # Build frame tree and extract main HTML / URL, similar to capture_page - to_return['frames'] = await self.make_frame_tree(page.main_frame) + try: + # Build frame tree and extract main HTML / URL, similar to capture_page + to_return['frames'] = await self.make_frame_tree(page.main_frame) - if frames := to_return.get('frames'): - # The first content is what we call rendered HTML, keep it as-is - if content := frames.get('content'): - to_return['html'] = content - if u := frames.get('url'): - if not u: - self.logger.error('Unable to get the URL of the main frame.') - u = '/!\\ Unknown /!\\' - to_return['last_redirected_url'] = u - - if 'html' in to_return and to_return['html'] is not None and with_favicon: - # We're probably (?) safe only looking for favicons in the main frame. - # TODO: check that? 
- try: - to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html']) - if page_capture_state is not None: - page_capture_state['mark_favicons_done']() - except (TimeoutError, asyncio.TimeoutError) as e: - self.logger.warning(f'[Timeout] Unable to get favicons on current page: {e}') - except Exception as e: - self.logger.warning(f'Unable to get favicons on current page: {e}') + if frames := to_return.get('frames'): + # The first content is what we call rendered HTML, keep it as-is + if content := frames.get('content'): + to_return['html'] = content + if u := frames.get('url'): + if not u: + self.logger.error('Unable to get the URL of the main frame.') + u = '/!\\ Unknown /!\\' + to_return['last_redirected_url'] = u - if with_screenshot: - to_return['png'] = await self._failsafe_get_screenshot(page) + if 'html' in to_return and to_return['html'] is not None and with_favicon: + # We're probably (?) safe only looking for favicons in the main frame. + # TODO: check that? + try: + to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html']) + if page_capture_state is not None: + page_capture_state['mark_favicons_done']() + except (TimeoutError, asyncio.TimeoutError) as e: + self.logger.warning(f'[Timeout] Unable to get favicons on current page: {e}') + except Exception as e: + self.logger.warning(f'Unable to get favicons on current page: {e}') - # Keep that all the way down there in case the capture failed. 
- if url := to_return.get('last_redirected_url'): - self._already_captured.add(url) - else: - self._already_captured.add(page.url) + if with_screenshot: + to_return['png'] = await self._failsafe_get_screenshot(page) - except PlaywrightTimeoutError as e: - errors.append(f"The capture took too long while capturing current page - {e.message}") - self.should_retry = True - except (asyncio.TimeoutError, TimeoutError): - errors.append("Something in the capture of the current page took too long") - self.should_retry = True - except TargetClosedError as e: - errors.append(f"The target was closed while capturing current page - {e}") - self.should_retry = True - except Error as e: - # NOTE: there are a lot of errors that look like duplicates and they are triggered at different times in the process. - # it is tricky to figure our which one should (and should not) trigger a retry. Below is our best guess and it will change over time. - self._update_exceptions(e) - errors.append(e.message) - to_return['error_name'] = e.name - # TODO: check e.message and figure out if it is worth retrying or not. - # NOTE: e.name is generally (always?) "Error" - if self._fatal_network_error(e) or self._fatal_auth_error(e) or self.fatal_browser_error(e): - self.logger.info(f'Unable to process current page: {e.name}') - elif self._retry_network_error(e) or self._retry_browser_error(e): - # this one sounds like something we can retry... - self.logger.info(f'Issue while capturing current page (retrying): {e.message}') - errors.append(f'Issue while capturing current page: {e.message}') + # Keep that all the way down there in case the capture failed. 
+ if url := to_return.get('last_redirected_url'): + self._already_captured.add(url) + else: + self._already_captured.add(page.url) + + except PlaywrightTimeoutError as e: + errors.append(f"The capture took too long while capturing current page - {e.message}") self.should_retry = True - else: - # Unexpected ones - self.logger.exception(f'Something went poorly while capturing current page: "{e.name}" - {e.message}') - except Exception as e: - # we may get a non-playwright exception to. - # The ones we try to handle here should be treated as if they were. - errors.append(str(e)) - if str(e) in ['Connection closed while reading from the driver']: - self.logger.info(f'Issue while capturing current page (retrying): {e}') - errors.append(f'Issue while capturing current page: {e}') + except (asyncio.TimeoutError, TimeoutError): + errors.append("Something in the capture of the current page took too long") self.should_retry = True - else: - raise e - - await self._finalize_capture( - page=page, - store_request=page_capture_state['store_request'] if page_capture_state is not None else None, - multiple_downloads=page_capture_state['multiple_downloads'] if page_capture_state is not None else None, - to_return=to_return, - errors=errors, - with_trusted_timestamps=with_trusted_timestamps, - ) + except TargetClosedError as e: + errors.append(f"The target was closed while capturing current page - {e}") + self.should_retry = True + except Error as e: + # NOTE: there are a lot of errors that look like duplicates and they are triggered at different times in the process. + # it is tricky to figure our which one should (and should not) trigger a retry. Below is our best guess and it will change over time. + self._update_exceptions(e) + errors.append(e.message) + to_return['error_name'] = e.name + # TODO: check e.message and figure out if it is worth retrying or not. + # NOTE: e.name is generally (always?) 
"Error" + if self._fatal_network_error(e) or self._fatal_auth_error(e) or self.fatal_browser_error(e): + self.logger.info(f'Unable to process current page: {e.name}') + elif self._retry_network_error(e) or self._retry_browser_error(e): + # this one sounds like something we can retry... + self.logger.info(f'Issue while capturing current page (retrying): {e.message}') + errors.append(f'Issue while capturing current page: {e.message}') + self.should_retry = True + else: + # Unexpected ones + self.logger.exception(f'Something went poorly while capturing current page: "{e.name}" - {e.message}') + except Exception as e: + # we may get a non-playwright exception to. + # The ones we try to handle here should be treated as if they were. + errors.append(str(e)) + if str(e) in ['Connection closed while reading from the driver']: + self.logger.info(f'Issue while capturing current page (retrying): {e}') + errors.append(f'Issue while capturing current page: {e}') + self.should_retry = True + else: + raise e + finally: + await self._finalize_capture( + page=page, + store_request=page_capture_state['store_request'] if page_capture_state is not None else None, + multiple_downloads=page_capture_state['multiple_downloads'] if page_capture_state is not None else None, + to_return=to_return, + errors=errors, + with_trusted_timestamps=with_trusted_timestamps, + ) self.logger.debug('Current-page capture done') return to_return From b0b86d67622ac0736396a9a4adfe92bc8f04cb3c Mon Sep 17 00:00:00 2001 From: Cormac Doherty <25778167+DocArmoryTech@users.noreply.github.com> Date: Sat, 4 Apr 2026 11:50:16 +0100 Subject: [PATCH 7/8] refactor(cookies): delegate coercion to Cookie.model_validate Remove `_coerce_cookie_mapping()` and rely on the `lookyloo-models` `Cookie` model to handle normalization. 
--- playwrightcapture/capture.py | 52 ++---------------------------------- 1 file changed, 2 insertions(+), 50 deletions(-) diff --git a/playwrightcapture/capture.py b/playwrightcapture/capture.py index 7adb1ac..c6b9640 100644 --- a/playwrightcapture/capture.py +++ b/playwrightcapture/capture.py @@ -19,7 +19,7 @@ from logging import LoggerAdapter, Logger from tempfile import NamedTemporaryFile from typing import Any, Literal, TYPE_CHECKING -from collections.abc import Awaitable, Callable, Mapping, MutableMapping +from collections.abc import Awaitable, Callable, MutableMapping from urllib.parse import urlparse, unquote, urljoin, urlsplit, urlunsplit, parse_qs, unquote_plus from zipfile import ZipFile @@ -480,49 +480,6 @@ def geolocation(self, geolocation: dict[str, str | int | float] | None) -> None: def cookies(self) -> list[Cookie]: return self._cookies - def _coerce_cookie_mapping(self, cookie: object) -> Mapping[str, Any] | None: - """Normalize supported cookie payload shapes to a mapping. - - Accepts plain mappings, Pydantic-style models, and simple objects with - cookie attributes so older callers can keep passing their existing - cookie objects. 
- """ - if isinstance(cookie, Mapping): - return cookie - - model_dump = getattr(cookie, 'model_dump', None) - if callable(model_dump): - try: - dumped_cookie = model_dump(exclude_none=True) - except TypeError: - dumped_cookie = model_dump() - if isinstance(dumped_cookie, Mapping): - return dumped_cookie - - dict_method = getattr(cookie, 'dict', None) - if callable(dict_method): - try: - dumped_cookie = dict_method(exclude_none=True) - except TypeError: - dumped_cookie = dict_method() - if isinstance(dumped_cookie, Mapping): - return dumped_cookie - - cookie_name = getattr(cookie, 'name', None) - cookie_value = getattr(cookie, 'value', None) - if cookie_name is None or cookie_value is None: - return None - - normalized_cookie: dict[str, Any] = { - 'name': cookie_name, - 'value': cookie_value, - } - for optional_key in ('url', 'domain', 'path', 'expires', 'httpOnly', 'secure', 'sameSite', 'partitionKey'): - optional_value = getattr(cookie, optional_key, None) - if optional_value is not None: - normalized_cookie[optional_key] = optional_value - return normalized_cookie - @cookies.setter def cookies(self, cookies: list[Cookie | dict[str, Any] | object] | None) -> None: '''Cookies to send along to the initial request. 
@@ -539,13 +496,8 @@ def cookies(self, cookies: list[Cookie | dict[str, Any] | object] | None) -> Non if isinstance(raw_cookie, Cookie): self._cookies.append(raw_cookie) continue - - cookie = self._coerce_cookie_mapping(raw_cookie) - if cookie is None: - self.logger.warning(f'Ignoring unsupported cookie payload: {raw_cookie!r}') - continue try: - self._cookies.append(Cookie.model_validate(cookie)) + self._cookies.append(Cookie.model_validate(raw_cookie)) except Exception as e: self.logger.warning(f'Invalid cookie: {e}') From 32d3a717119ecae583f08dfae8a5e60b7633f9ff Mon Sep 17 00:00:00 2001 From: Cormac Doherty <25778167+DocArmoryTech@users.noreply.github.com> Date: Sat, 4 Apr 2026 11:50:39 +0100 Subject: [PATCH 8/8] refactor(capture): move state to instance attrs & merge capture_current_page - Replace `PageCaptureState` `TypedDict` with instance attributes (`_multiple_downloads`, `_store_request`, `_mark_favicons_done`) on `Capture`. - `setup_page_capture()` now returns `None` and `_finalize_capture()` reads from `self`. - Make `url` optional and add `current_page_only` flag to `capture_page()` - when `True`, the method snapshots the page as-is without navigation. - Delete the now-redundant `capture_current_page()` method. 
--- playwrightcapture/capture.py | 548 ++++++++++++++++------------------- 1 file changed, 246 insertions(+), 302 deletions(-) diff --git a/playwrightcapture/capture.py b/playwrightcapture/capture.py index c6b9640..537196b 100644 --- a/playwrightcapture/capture.py +++ b/playwrightcapture/capture.py @@ -106,14 +106,6 @@ class CaptureResponse(TypedDict, total=False): potential_favicons: set[bytes] | None -class PageCaptureState(TypedDict): - """Per-page runtime state shared between setup and finalization.""" - - multiple_downloads: list[tuple[str, bytes]] - store_request: Callable[[Request], Awaitable[None]] - mark_favicons_done: Callable[[], None] - - class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg] """ Prepend log entry with the UUID of the capture @@ -224,6 +216,11 @@ def __init__(self, browser: BROWSER | None=None, device_name: str | None=None, # concurrent capture gets its own display without mutating os.environ. self._display = display + # Per-page capture state populated by setup_page_capture(). + self._multiple_downloads: list[tuple[str, bytes]] = [] + self._store_request: Callable[[Request], Awaitable[None]] | None = None + self._mark_favicons_done: Callable[[], None] | None = None + # Initialize the magic DB self.magicdb = MagicDb() @@ -310,7 +307,7 @@ async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool: return True - async def setup_page_capture(self, page: Page, *, allow_tracking: bool=False) -> PageCaptureState: + async def setup_page_capture(self, page: Page, *, allow_tracking: bool=False) -> None: """Prepare a page for a single-page capture without changing capture semantics. 
This method preserves the existing per-page setup used by capture_page: @@ -326,7 +323,7 @@ async def setup_page_capture(self, page: Page, *, allow_tracking: bool=False) -> self.wait_for_download = 0 # We may have multiple download triggered via JS - multiple_downloads: list[tuple[str, bytes]] = [] + self._multiple_downloads = [] async def handle_download(download: Download) -> None: # This method is called when a download event is triggered from JS in a page that also renders @@ -338,7 +335,7 @@ async def handle_download(download: Download) -> None: filename = download.suggested_filename with open(tmp_f.name, "rb") as f: file_content = f.read() - multiple_downloads.append((filename, file_content)) + self._multiple_downloads.append((filename, file_content)) self.logger.info('Done with download.') except Exception as e: if download.page.is_closed(): @@ -409,16 +406,13 @@ async def _override_content_disposition_handler(route: Route, request: Request) page.set_default_timeout((self._capture_timeout - 2) * 1000) # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher + self._store_request = store_request + self._mark_favicons_done = mark_favicons_done + page.on("requestfinished", store_request) page.on("dialog", lambda dialog: dialog.accept()) page.on("download", handle_download) - return { - 'multiple_downloads': multiple_downloads, - 'store_request': store_request, - 'mark_favicons_done': mark_favicons_done, - } - @property def locale(self) -> str: return self._locale @@ -483,8 +477,6 @@ def cookies(self) -> list[Cookie]: @cookies.setter def cookies(self, cookies: list[Cookie | dict[str, Any] | object] | None) -> None: '''Cookies to send along to the initial request. - Accepts Playwright cookie dictionaries as well as model/object wrappers - exposing equivalent fields. 
:param cookies: The cookies, in this format: https://playwright.dev/python/docs/api/class-browsercontext#browser-context-add-cookies ''' @@ -1107,27 +1099,19 @@ async def _finalize_capture( self, *, page: Page, - store_request: Callable[[Request], Awaitable[None]] | None, - multiple_downloads: list[tuple[str, bytes]] | None, to_return: CaptureResponse, errors: list[str], with_trusted_timestamps: bool, ) -> None: - """Common finalization logic for captures (downloads, cookies, storage, HAR, socks5, timestamps). - - This helper centralizes the tail of a capture, which previously lived at the end - of capture_page. It is now also used by capture_current_page, consuming the - state returned by setup_page_capture when available, to avoid code duplication - while keeping single-page finalization behavior aligned. - """ + """Common finalization logic for captures (downloads, cookies, storage, HAR, socks5, timestamps).""" self.logger.debug('Finishing up capture (helper).') # We may have multiple downloads triggered via JS; if so, deduplicate them and, # when there is more than one, bundle them into a zip stored in-memory. # This mirrors the behavior previously implemented at the end of capture_page. - if multiple_downloads is not None: - if multiple_dls := set(multiple_downloads): + if self._multiple_downloads: + if multiple_dls := set(self._multiple_downloads): if len(multiple_dls) == 1: dl = multiple_dls.pop() to_return["downloaded_filename"] = dl[0] @@ -1173,9 +1157,9 @@ async def _finalize_capture( if not page.is_closed(): # Remove request listener if we set one; best-effort only as it is # primarily used for favicon extraction and should not break captures. 
- if store_request is not None: + if self._store_request is not None: try: - page.remove_listener("requestfinished", store_request) + page.remove_listener("requestfinished", self._store_request) except Exception: # Best-effort only pass @@ -1241,7 +1225,7 @@ async def _finalize_capture( except Exception as e: self.logger.warning(f'Unable to get trusted timestamps: {e}') - async def capture_page(self, url: str, *, max_depth_capture_time: int, + async def capture_page(self, url: str | None=None, *, max_depth_capture_time: int, referer: str | None=None, page: Page | None=None, depth: int=0, rendered_hostname_only: bool=True, @@ -1249,6 +1233,7 @@ async def capture_page(self, url: str, *, max_depth_capture_time: int, with_favicon: bool=False, allow_tracking: bool=False, with_trusted_timestamps: bool=False, + current_page_only: bool=False, final_wait: int=5 ) -> CaptureResponse: """Capture a URL and optionally recurse into child links. @@ -1257,13 +1242,21 @@ async def capture_page(self, url: str, *, max_depth_capture_time: int, performs the navigation, and finalizes the capture before returning. Recursive child captures reuse the existing page and therefore skip the outer setup/finalization path. + + When `current_page_only` is True the method snapshots the page as-is + (no navigation, no recursion) and then finalizes. This is the path + used by interactive captures after setup_page_capture has already been + called by the caller. 
""" to_return: CaptureResponse = {} errors: list[str] = [] - page_capture_state: PageCaptureState | None = None - if page is not None: + if current_page_only: + if page is None: + raise InvalidPlaywrightParameter('current_page_only requires a page argument') + capturing_sub = False + elif page is not None: capturing_sub = True else: capturing_sub = False @@ -1275,172 +1268,234 @@ async def capture_page(self, url: str, *, max_depth_capture_time: int, to_return['error'] = f'Unable to create new page: {e}' return to_return - page_capture_state = await self.setup_page_capture(page, allow_tracking=allow_tracking) + await self.setup_page_capture(page, allow_tracking=allow_tracking) try: - try: - await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '') - except Error as initial_error: - self._update_exceptions(initial_error) - # So this one is really annoying: chromium raises a net::ERR_ABORTED when it hits a download - if initial_error.name in ['Download is starting', 'net::ERR_ABORTED']: - # page.goto failed, but it triggered a download event. - # Let's re-trigger it. - try: - async with page.expect_download() as download_info: + if current_page_only: + # Snapshot the current page state without navigation or recursion. 
+ try: + to_return['frames'] = await self.make_frame_tree(page.main_frame) + + if frames := to_return.get('frames'): + if content := frames.get('content'): + to_return['html'] = content + if u := frames.get('url'): + if not u: + self.logger.error('Unable to get the URL of the main frame.') + u = '/!\\ Unknown /!\\' + to_return['last_redirected_url'] = u + + if 'html' in to_return and to_return['html'] is not None and with_favicon: + try: + to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html']) + if self._mark_favicons_done is not None: + self._mark_favicons_done() + except (TimeoutError, asyncio.TimeoutError) as e: + self.logger.warning(f'[Timeout] Unable to get favicons on current page: {e}') + except Exception as e: + self.logger.warning(f'Unable to get favicons on current page: {e}') + + if with_screenshot: + to_return['png'] = await self._failsafe_get_screenshot(page) + + if captured_url := to_return.get('last_redirected_url'): + self._already_captured.add(captured_url) + else: + self._already_captured.add(page.url) + except PlaywrightTimeoutError as e: + errors.append(f"The capture took too long while capturing current page - {e.message}") + self.should_retry = True + except (asyncio.TimeoutError, TimeoutError): + errors.append("Something in the capture of the current page took too long") + self.should_retry = True + except TargetClosedError as e: + errors.append(f"The target was closed while capturing current page - {e}") + self.should_retry = True + except Error as e: + self._update_exceptions(e) + errors.append(e.message) + to_return['error_name'] = e.name + if self._fatal_network_error(e) or self._fatal_auth_error(e) or self.fatal_browser_error(e): + self.logger.info(f'Unable to process current page: {e.name}') + elif self._retry_network_error(e) or self._retry_browser_error(e): + self.logger.info(f'Issue while capturing current page (retrying): {e.message}') + errors.append(f'Issue while capturing current page: 
{e.message}') + self.should_retry = True + else: + self.logger.exception(f'Something went poorly while capturing current page: "{e.name}" - {e.message}') + except Exception as e: + errors.append(str(e)) + if str(e) in ['Connection closed while reading from the driver']: + self.logger.info(f'Issue while capturing current page (retrying): {e}') + errors.append(f'Issue while capturing current page: {e}') + self.should_retry = True + else: + raise e + else: + # Standard navigation + capture path. + assert url is not None + try: + await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '') + except Error as initial_error: + self._update_exceptions(initial_error) + # So this one is really annoying: chromium raises a net::ERR_ABORTED when it hits a download + if initial_error.name in ['Download is starting', 'net::ERR_ABORTED']: + # page.goto failed, but it triggered a download event. + # Let's re-trigger it. + try: + async with page.expect_download() as download_info: + try: + await page.goto(url, referer=referer if referer else '') + except Exception: + pass + with NamedTemporaryFile() as tmp_f: + download = await download_info.value + await download.save_as(tmp_f.name) + filename = download.suggested_filename + with open(tmp_f.name, "rb") as f: + file_content = f.read() + self._multiple_downloads.append((filename, file_content)) + except PlaywrightTimeoutError: + self.logger.debug('No download has been triggered.') + raise initial_error + except Error as e: try: - await page.goto(url, referer=referer if referer else '') + error_msg = download.failure() + if not error_msg: + raise e + errors.append(f"Error while downloading: {error_msg}") + self.logger.info(f'Error while downloading: {error_msg}') + self.should_retry = True except Exception: - pass - with NamedTemporaryFile() as tmp_f: - download = await download_info.value - await download.save_as(tmp_f.name) - filename = download.suggested_filename - with open(tmp_f.name, "rb") as f: - 
file_content = f.read() - if page_capture_state is not None: - page_capture_state['multiple_downloads'].append((filename, file_content)) - except PlaywrightTimeoutError: - self.logger.debug('No download has been triggered.') - raise initial_error - except Error as e: - try: - error_msg = download.failure() - if not error_msg: raise e - errors.append(f"Error while downloading: {error_msg}") - self.logger.info(f'Error while downloading: {error_msg}') - self.should_retry = True - except Exception: - raise e + else: + raise initial_error else: - raise initial_error - else: - await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded - try: - await page.bring_to_front() - self.logger.debug('Page moved to front.') - except Error as e: - self.logger.warning(f'Unable to bring the page to the front: {e}.') + await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded + try: + await page.bring_to_front() + self.logger.debug('Page moved to front.') + except Error as e: + self.logger.warning(f'Unable to bring the page to the front: {e}.') - try: - if self.headless: - await self.__instrumentation(page, url, allow_tracking, final_wait) - else: - self.logger.debug('Headed mode, skipping instrumentation.') - await self._wait_for_random_timeout(page, self._capture_timeout - 5) - except Exception as e: - self.logger.exception(f'Error during instrumentation: {e}') - - # ### -------------------------------------- - # NOTE 2025-11-12: disabling the offline setting as it doesn't seem - # to solve the issue with the frames, but causes some failure - # while getting the stored state - - # Pass browser to offline mode to get content and make screenshot - # await self.context.set_offline(True) - # await self._safe_wait(page, 5) - # self.logger.info('Browser offline.') - # Abort everything - # await page.route("**/*", lambda route: route.abort()) - # await self._safe_wait(page, 5) - - to_return['frames'] = await self.make_frame_tree(page.main_frame) - 
- # ### -------------------------------------- - - # The first content is what we call rendered HTML, keep it as-is - if frames := to_return.get('frames'): - if content := frames.get('content'): - to_return['html'] = content - if u := frames.get('url'): - if not u: - self.logger.error('Unable to get the URL of the main frame.') - u = '/!\\ Unknown /!\\' - to_return['last_redirected_url'] = u - - if 'html' in to_return and to_return['html'] is not None and with_favicon: - # We're probably (?) safe only looking for favicons in the main frame. - # TODO: check that? try: - to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html']) - if page_capture_state is not None: - page_capture_state['mark_favicons_done']() - except (TimeoutError, asyncio.TimeoutError) as e: - self.logger.warning(f'[Timeout] Unable to get favicons: {e}') + if self.headless: + await self.__instrumentation(page, url, allow_tracking, final_wait) + else: + self.logger.debug('Headed mode, skipping instrumentation.') + await self._wait_for_random_timeout(page, self._capture_timeout - 5) except Exception as e: - self.logger.warning(f'Unable to get favicons: {e}') - - if with_screenshot: - to_return['png'] = await self._failsafe_get_screenshot(page) - - # Keep that all the way down there in case the capture failed. - self._already_captured.add(url) - - if depth > 0 and to_return.get('html') and to_return['html']: - # TODO with children frames: - # 1. if the frame hasa URL, use that as base URL/referer for the subsequent captures - # 2. 
if it doesn't, the base URL is the url of the parent (which may or may not be the main frame) - if child_urls := self._get_links_from_rendered_page(page.url, to_return['html'], rendered_hostname_only): - to_return['children'] = [] - depth -= 1 - total_urls = len(child_urls) - max_capture_time = max(int(max_depth_capture_time / total_urls), self._minimal_timeout) - max_captures = int(max_depth_capture_time / max_capture_time) - if max_captures < total_urls: - self.logger.warning(f'Attempting to capture URLs from {page.url} but there are too many ({total_urls}) to capture in too little time. Only capturing the first {max_captures} URLs in the page.') - if max_captures <= 0: - # We don't really have time for even one capture, but let's try anyway. - child_urls = child_urls[:1] - else: - child_urls = child_urls[:max_captures] - self.logger.info(f'Capturing children, {max_captures} URLs') - consecutive_errors = 0 - for index, url in enumerate(child_urls): - self.logger.info(f'Capture child {url} - Timeout: {max_capture_time}s') - start_time = time.time() - if page.is_closed(): - self.logger.info('Page is closed, unable to capture children.') - break - try: - async with timeout(max_capture_time + 1): # just adding a bit of padding so playwright has the chance to raise the exception first - child_capture = await self.capture_page( - url=url, referer=page.url, - page=page, depth=depth, - rendered_hostname_only=rendered_hostname_only, - max_depth_capture_time=max_capture_time, - with_screenshot=with_screenshot, - final_wait=final_wait) - if with_trusted_timestamps: - try: - await self._get_trusted_timestamps(child_capture) - except Exception as e: - self.logger.warning(f'Unable to get the trusted timestamps for the clild capture : {e}.') - to_return['children'].append(child_capture) # type: ignore[union-attr] - except (TimeoutError, asyncio.TimeoutError): - self.logger.info(f'Timeout error, took more than {max_capture_time}s. 
Unable to capture {url}.') - consecutive_errors += 1 - except Exception as e: - self.logger.warning(f'Error while capturing child "{url}": {e}. {len(child_urls) - index - 1} more to go.') - consecutive_errors += 1 - else: - consecutive_errors = 0 - runtime = int(time.time() - start_time) - self.logger.info(f'Successfully captured child URL: {url} in {runtime}s. {len(child_urls) - index - 1} to go.') - - if consecutive_errors >= 5: - # if we have more than 5 consecutive errors, the capture is most probably broken, breaking. - self.logger.warning('Got more than 5 consecutive errors while capturing children, breaking.') - errors.append("Got more than 5 consecutive errors while capturing children") - self.should_retry = True - break + self.logger.exception(f'Error during instrumentation: {e}') - try: - await page.go_back() - except PlaywrightTimeoutError: - self.logger.info('Go back timed out, it is probably not a big deal.') - except Exception as e: - self.logger.info(f'Unable to go back: {e}.') + # ### -------------------------------------- + # NOTE 2025-11-12: disabling the offline setting as it doesn't seem + # to solve the issue with the frames, but causes some failure + # while getting the stored state + + # Pass browser to offline mode to get content and make screenshot + # await self.context.set_offline(True) + # await self._safe_wait(page, 5) + # self.logger.info('Browser offline.') + # Abort everything + # await page.route("**/*", lambda route: route.abort()) + # await self._safe_wait(page, 5) + + to_return['frames'] = await self.make_frame_tree(page.main_frame) + + # ### -------------------------------------- + + # The first content is what we call rendered HTML, keep it as-is + if frames := to_return.get('frames'): + if content := frames.get('content'): + to_return['html'] = content + if u := frames.get('url'): + if not u: + self.logger.error('Unable to get the URL of the main frame.') + u = '/!\\ Unknown /!\\' + to_return['last_redirected_url'] = u + + if 
'html' in to_return and to_return['html'] is not None and with_favicon: + # We're probably (?) safe only looking for favicons in the main frame. + # TODO: check that? + try: + to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html']) + if self._mark_favicons_done is not None: + self._mark_favicons_done() + except (TimeoutError, asyncio.TimeoutError) as e: + self.logger.warning(f'[Timeout] Unable to get favicons: {e}') + except Exception as e: + self.logger.warning(f'Unable to get favicons: {e}') + + if with_screenshot: + to_return['png'] = await self._failsafe_get_screenshot(page) + + # Keep that all the way down there in case the capture failed. + self._already_captured.add(url) + + if depth > 0 and to_return.get('html') and to_return['html']: + # TODO with children frames: + # 1. if the frame hasa URL, use that as base URL/referer for the subsequent captures + # 2. if it doesn't, the base URL is the url of the parent (which may or may not be the main frame) + if child_urls := self._get_links_from_rendered_page(page.url, to_return['html'], rendered_hostname_only): + to_return['children'] = [] + depth -= 1 + total_urls = len(child_urls) + max_capture_time = max(int(max_depth_capture_time / total_urls), self._minimal_timeout) + max_captures = int(max_depth_capture_time / max_capture_time) + if max_captures < total_urls: + self.logger.warning(f'Attempting to capture URLs from {page.url} but there are too many ({total_urls}) to capture in too little time. Only capturing the first {max_captures} URLs in the page.') + if max_captures <= 0: + # We don't really have time for even one capture, but let's try anyway. 
+ child_urls = child_urls[:1] + else: + child_urls = child_urls[:max_captures] + self.logger.info(f'Capturing children, {max_captures} URLs') + consecutive_errors = 0 + for index, url in enumerate(child_urls): + self.logger.info(f'Capture child {url} - Timeout: {max_capture_time}s') + start_time = time.time() + if page.is_closed(): + self.logger.info('Page is closed, unable to capture children.') + break + try: + async with timeout(max_capture_time + 1): # just adding a bit of padding so playwright has the chance to raise the exception first + child_capture = await self.capture_page( + url=url, referer=page.url, + page=page, depth=depth, + rendered_hostname_only=rendered_hostname_only, + max_depth_capture_time=max_capture_time, + with_screenshot=with_screenshot, + final_wait=final_wait) + if with_trusted_timestamps: + try: + await self._get_trusted_timestamps(child_capture) + except Exception as e: + self.logger.warning(f'Unable to get the trusted timestamps for the clild capture : {e}.') + to_return['children'].append(child_capture) # type: ignore[union-attr] + except (TimeoutError, asyncio.TimeoutError): + self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.') + consecutive_errors += 1 + except Exception as e: + self.logger.warning(f'Error while capturing child "{url}": {e}. {len(child_urls) - index - 1} more to go.') + consecutive_errors += 1 + else: + consecutive_errors = 0 + runtime = int(time.time() - start_time) + self.logger.info(f'Successfully captured child URL: {url} in {runtime}s. {len(child_urls) - index - 1} to go.') + + if consecutive_errors >= 5: + # if we have more than 5 consecutive errors, the capture is most probably broken, breaking. 
+ self.logger.warning('Got more than 5 consecutive errors while capturing children, breaking.') + errors.append("Got more than 5 consecutive errors while capturing children") + self.should_retry = True + break + + try: + await page.go_back() + except PlaywrightTimeoutError: + self.logger.info('Go back timed out, it is probably not a big deal.') + except Exception as e: + self.logger.info(f'Unable to go back: {e}.') except PlaywrightTimeoutError as e: errors.append(f"The capture took too long - {e.message}") @@ -1483,8 +1538,6 @@ async def capture_page(self, url: str, *, max_depth_capture_time: int, if not capturing_sub: await self._finalize_capture( page=page, - store_request=page_capture_state['store_request'] if page_capture_state is not None else None, - multiple_downloads=page_capture_state['multiple_downloads'] if page_capture_state is not None else None, to_return=to_return, errors=errors, with_trusted_timestamps=with_trusted_timestamps, @@ -1492,115 +1545,6 @@ async def capture_page(self, url: str, *, max_depth_capture_time: int, self.logger.debug('Capture done') return to_return - async def capture_current_page( - self, - page: Page, - *, - rendered_hostname_only: bool=True, - with_screenshot: bool=True, - with_favicon: bool=False, - with_trusted_timestamps: bool=False, - page_capture_state: PageCaptureState | None=None, - ) -> CaptureResponse: - """Capture the state of the current page only. - - This method is the final-page path used by interactive captures. It does - not navigate, recurse into links, or perform crawler-style expansion. - It snapshots the page as it exists when called, then runs the normal - single-page finalization steps. If the caller already ran - setup_page_capture, pass its state so download and favicon bookkeeping are - finalized consistently. 
- """ - - to_return: CaptureResponse = {} - errors: list[str] = [] - - try: - try: - # Build frame tree and extract main HTML / URL, similar to capture_page - to_return['frames'] = await self.make_frame_tree(page.main_frame) - - if frames := to_return.get('frames'): - # The first content is what we call rendered HTML, keep it as-is - if content := frames.get('content'): - to_return['html'] = content - if u := frames.get('url'): - if not u: - self.logger.error('Unable to get the URL of the main frame.') - u = '/!\\ Unknown /!\\' - to_return['last_redirected_url'] = u - - if 'html' in to_return and to_return['html'] is not None and with_favicon: - # We're probably (?) safe only looking for favicons in the main frame. - # TODO: check that? - try: - to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html']) - if page_capture_state is not None: - page_capture_state['mark_favicons_done']() - except (TimeoutError, asyncio.TimeoutError) as e: - self.logger.warning(f'[Timeout] Unable to get favicons on current page: {e}') - except Exception as e: - self.logger.warning(f'Unable to get favicons on current page: {e}') - - if with_screenshot: - to_return['png'] = await self._failsafe_get_screenshot(page) - - # Keep that all the way down there in case the capture failed. 
- if url := to_return.get('last_redirected_url'): - self._already_captured.add(url) - else: - self._already_captured.add(page.url) - - except PlaywrightTimeoutError as e: - errors.append(f"The capture took too long while capturing current page - {e.message}") - self.should_retry = True - except (asyncio.TimeoutError, TimeoutError): - errors.append("Something in the capture of the current page took too long") - self.should_retry = True - except TargetClosedError as e: - errors.append(f"The target was closed while capturing current page - {e}") - self.should_retry = True - except Error as e: - # NOTE: there are a lot of errors that look like duplicates and they are triggered at different times in the process. - # it is tricky to figure our which one should (and should not) trigger a retry. Below is our best guess and it will change over time. - self._update_exceptions(e) - errors.append(e.message) - to_return['error_name'] = e.name - # TODO: check e.message and figure out if it is worth retrying or not. - # NOTE: e.name is generally (always?) "Error" - if self._fatal_network_error(e) or self._fatal_auth_error(e) or self.fatal_browser_error(e): - self.logger.info(f'Unable to process current page: {e.name}') - elif self._retry_network_error(e) or self._retry_browser_error(e): - # this one sounds like something we can retry... - self.logger.info(f'Issue while capturing current page (retrying): {e.message}') - errors.append(f'Issue while capturing current page: {e.message}') - self.should_retry = True - else: - # Unexpected ones - self.logger.exception(f'Something went poorly while capturing current page: "{e.name}" - {e.message}') - except Exception as e: - # we may get a non-playwright exception to. - # The ones we try to handle here should be treated as if they were. 
- errors.append(str(e)) - if str(e) in ['Connection closed while reading from the driver']: - self.logger.info(f'Issue while capturing current page (retrying): {e}') - errors.append(f'Issue while capturing current page: {e}') - self.should_retry = True - else: - raise e - finally: - await self._finalize_capture( - page=page, - store_request=page_capture_state['store_request'] if page_capture_state is not None else None, - multiple_downloads=page_capture_state['multiple_downloads'] if page_capture_state is not None else None, - to_return=to_return, - errors=errors, - with_trusted_timestamps=with_trusted_timestamps, - ) - - self.logger.debug('Current-page capture done') - return to_return - async def _get_trusted_timestamps(self, capture_response: CaptureResponse) -> None: """Get trusted timestamps for the relevant values in the response""" if not self.tt_settings: