From 027b86dc7e2c573693b7cb83e9e65acbcae8828a Mon Sep 17 00:00:00 2001 From: Alexander Lohnau Date: Wed, 26 Jun 2024 20:55:46 +0200 Subject: [PATCH] Fix performance issues when having remote schema with multiple refs Previously, refs were requested without any on-disk caching, and the network request was repeated for each file that is loaded. Now the refs of the given schema are cached as well. To avoid further HTTP overhead, cached schemas are not revalidated again if they were already loaded/revalidated in the same process. Before: ``` time check-jsonschema --verbose --schemafile https://autoconfig.kde.org/jsonschemas/_combined.schema.json plugins/**.json ok -- validation done The following files were checked: .... plugins/virtualmonitor/kdeconnect_virtualmonitor.json ________________________________________________________ Executed in 31.14 secs fish external usr time 19.73 secs 5.08 millis 19.72 secs sys time 0.28 secs 0.01 millis 0.28 secs ``` After: ``` time python3 ~/projects/check-jsonschema/src/check_jsonschema/__main__.py --verbose --schemafile https://autoconfig.kde.org/jsonschemas/_combined.schema.json plugins/**.json ok -- validation done The following files were checked: ... 
plugins/virtualmonitor/kdeconnect_virtualmonitor.json ________________________________________________________ Executed in 2.23 secs fish external usr time 1.43 secs 4.72 millis 1.43 secs sys time 0.08 secs 0.09 millis 0.08 secs ``` --- src/check_jsonschema/cachedownloader.py | 23 +++++++++++++++---- .../schema_loader/resolver.py | 10 ++++---- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/check_jsonschema/cachedownloader.py b/src/check_jsonschema/cachedownloader.py index a31913097..935294688 100644 --- a/src/check_jsonschema/cachedownloader.py +++ b/src/check_jsonschema/cachedownloader.py @@ -24,6 +24,9 @@ class CacheDownloader: # this will let us do any other caching we might need in the future in the same # cache dir (adjacent to "downloads") _CACHEDIR_NAME = os.path.join("check_jsonschema", "downloads") + # Keep list of newly loaded/revalidated schemas in memory to avoid network requests + # Especially useful for schemas making extensive use of refs to remote URLs + _DOWNLOADED_URIS: set[str] = set() def __init__( self, @@ -113,10 +116,14 @@ def _write(self, dest: str, response: requests.Response) -> None: shutil.copy(fp.name, dest) os.remove(fp.name) - def _download(self) -> str: + def _cachefile_path(self) -> str: assert self._cache_dir os.makedirs(self._cache_dir, exist_ok=True) - dest = os.path.join(self._cache_dir, self._filename) + return os.path.join(self._cache_dir, self._filename) + + def _download(self) -> str: + dest = self._cachefile_path() + CacheDownloader._DOWNLOADED_URIS.add(self._file_url) response = self._get_request() # check to see if we have a file which matches the connection @@ -130,6 +137,14 @@ def _download(self) -> str: def open(self) -> t.Iterator[t.IO[bytes]]: if (not self._cache_dir) or self._disable_cache: yield io.BytesIO(self._get_request().content) + else: - with open(self._download(), "rb") as fp: - yield fp + cachefile = self._cachefile_path() + if self._file_url in CacheDownloader._DOWNLOADED_URIS and 
os.path.exists( + cachefile + ): + with open(cachefile, "rb") as fp: + yield fp + else: + with open(self._download(), "rb") as fp: + yield fp diff --git a/src/check_jsonschema/schema_loader/resolver.py b/src/check_jsonschema/schema_loader/resolver.py index 1ad1248df..8cda9a730 100644 --- a/src/check_jsonschema/schema_loader/resolver.py +++ b/src/check_jsonschema/schema_loader/resolver.py @@ -4,9 +4,10 @@ import urllib.parse import referencing -import requests from referencing.jsonschema import DRAFT202012, Schema +from check_jsonschema.cachedownloader import CacheDownloader + from ..parsers import ParserSet from ..utils import filename2path @@ -62,10 +63,9 @@ def retrieve_reference(uri: str) -> referencing.Resource[Schema]: full_uri_scheme = urllib.parse.urlsplit(full_uri).scheme if full_uri_scheme in ("http", "https"): - data = requests.get(full_uri, stream=True) - parsed_object = parser_set.parse_data_with_path( - data.content, full_uri, "json" - ) + dwl = CacheDownloader(full_uri) + with dwl.open() as file: + parsed_object = parser_set.parse_data_with_path(file, full_uri, "json") else: parsed_object = get_local_file(full_uri)