From 4a3eb48fa8b7f7f10bd84d668601b05765550ef8 Mon Sep 17 00:00:00 2001 From: Gleb Nikonorov Date: Wed, 18 Nov 2020 13:52:30 -0500 Subject: [PATCH 1/4] one candidate solution --- update_index.py | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/update_index.py b/update_index.py index 7b8a129..4b8056d 100644 --- a/update_index.py +++ b/update_index.py @@ -17,8 +17,11 @@ """ import json import os +import re import sys +import time from distutils.version import LooseVersion +from xmlrpc.client import Fault from xmlrpc.client import ServerProxy INDEX_FILE_NAME = os.path.join(os.path.dirname(__file__), "index.json") @@ -26,6 +29,34 @@ BLACKLIST = {"pytest-nbsmoke"} +def get_releases_for_package(client, package_name): + """ + Get all release versions for package 'package_name' and return them to the caller + + :param client: xmlrpclib.ServerProxy + :param package_name: package name to search for + """ + versions = None + + fetched_releases = False + while not fetched_releases: + try: + versions = client.package_releases(package_name) + fetched_releases = True + except Fault as fault: + # The message is like: + # The action could not be performed because there were too many requests by the client. Limit may reset in 1 seconds. + #raise ValueError(fault.faultString) + regex_match = re.search('^.+Limit may reset in (\d+) seconds\.$', fault.faultString) + if regex_match is None: + raise fault + + sleep_amt = int(regex_match.group(1)) + time.sleep(sleep_amt) + + return versions + + def iter_plugins(client, blacklist, *, consider_classifier=True): """ Returns an iterator of (name, latest version, summary) from PyPI. @@ -36,9 +67,14 @@ def iter_plugins(client, blacklist, *, consider_classifier=True): # previously we used the more efficient "search" XMLRPC method, but # that stopped returning all results after a while package_names = [x for x in client.list_packages() if x.startswith("pytest-")] + package_names = package_names[1:50] # TEMP: for testing full set is way too large names_and_versions = {} + print("TEMP: Processing '{}' packages".format(len(package_names))) + counter = 0 for name in package_names: - versions = client.package_releases(name) + print("process package '{}'".format(counter)) + counter += 1 + versions = get_releases_for_package(client, name) if versions: # Package can exist without public releases names_and_versions[name] = max(versions, key=LooseVersion) From a66110514f537fc2f755b5b7cb7999b22f305c08 Mon Sep 17 00:00:00 2001 From: Gleb Nikonorov Date: Wed, 18 Nov 2020 18:31:51 -0500 Subject: [PATCH 2/4] finished solution --- update_index.py | 76 ++++++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 30 deletions(-) diff --git a/update_index.py b/update_index.py index 4b8056d..c35fc35 100644 --- a/update_index.py +++ b/update_index.py @@ -28,53 +28,69 @@ BLACKLIST = {"pytest-nbsmoke"} +class RateLimitedServerProxy: + def __init__(self, uri): + self._server_proxy = ServerProxy(uri) -def get_releases_for_package(client, package_name): - """ - Get all release versions for package 'package_name' and return them to the caller + def browse(self, classifiers): + return self._rate_limit_request(self._server_proxy.browse, [classifiers]) - :param client: xmlrpclib.ServerProxy - :param package_name: package name to search for - """ - versions = None + def list_packages(self): + return self._rate_limit_request(self._server_proxy.list_packages) + + def package_releases(self, package_name): + return self._rate_limit_request(self._server_proxy.package_releases, [package_name]) + + def release_data(self, name, version): + return self._rate_limit_request(self._server_proxy.release_data, [name, version]) + + def _rate_limit_request(self, request_method, args=None): + return_value = None + + fetched_releases = False + while not fetched_releases: + try: + if args is not None: + return_value = request_method(*args) + else: + return_value = request_method() + fetched_releases = True + except Fault as fault: + # If PyPI times us out, sleep and try again depending on the error message received + unandled_exception = True + + # The fault message is of form: + # The action could not be performed because there were too many requests by the client. Limit may reset in 1 seconds. + limit_reset_regex_match = re.search('^.+Limit may reset in (\d+) seconds\.$', fault.faultString) + if limit_reset_regex_match is not None: + sleep_amt = int(limit_reset_regex_match.group(1)) + time.sleep(sleep_amt) + unhandled_exception = False - fetched_releases = False - while not fetched_releases: - try: - versions = client.package_releases(package_name) - fetched_releases = True - except Fault as fault: - # The message is like: - # The action could not be performed because there were too many requests by the client. Limit may reset in 1 seconds. - #raise ValueError(fault.faultString) - regex_match = re.search('^.+Limit may reset in (\d+) seconds\.$', fault.faultString) - if regex_match is None: - raise fault + too_many_requests_regex_match = re.search('^.+The action could not be performed because there were too many requests by the client.$', fault.faultString) + if too_many_requests_regex_match is not None: + time.sleep(60) + unhandled_exception = False - sleep_amt = int(regex_match.group(1)) - time.sleep(sleep_amt) + if unhandled_exception: + raise fault - return versions + return return_value def iter_plugins(client, blacklist, *, consider_classifier=True): """ Returns an iterator of (name, latest version, summary) from PyPI. - :param client: xmlrpclib.ServerProxy + :param client: RateLimitedServerProxy :param search: package names to search for """ # previously we used the more efficient "search" XMLRPC method, but # that stopped returning all results after a while package_names = [x for x in client.list_packages() if x.startswith("pytest-")] - package_names = package_names[1:50] # TEMP: for testing full set is way too large names_and_versions = {} - print("TEMP: Processing '{}' packages".format(len(package_names))) - counter = 0 for name in package_names: - print("process package '{}'".format(counter)) - counter += 1 - versions = get_releases_for_package(client, name) + versions = client.package_releases(name) if versions: # Package can exist without public releases names_and_versions[name] = max(versions, key=LooseVersion) @@ -132,7 +148,7 @@ def write_plugins_index(file_name, plugins): def main(): - client = ServerProxy("https://pypi.org/pypi") + client = RateLimitedServerProxy("https://pypi.org/pypi") plugins = sorted(iter_plugins(client, BLACKLIST, consider_classifier=False)) if write_plugins_index(INDEX_FILE_NAME, plugins): From 3371705d0d514cccc8936f3437f2e1c4ac8e56aa Mon Sep 17 00:00:00 2001 From: Gleb Nikonorov Date: Wed, 18 Nov 2020 18:36:39 -0500 Subject: [PATCH 3/4] better comments --- update_index.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/update_index.py b/update_index.py index c35fc35..66f9a29 100644 --- a/update_index.py +++ b/update_index.py @@ -56,7 +56,7 @@ def _rate_limit_request(self, request_method, args=None): return_value = request_method() fetched_releases = True except Fault as fault: - # If PyPI times us out, sleep and try again depending on the error message received + # If PyPI errors due to too many requests, sleep and try again depending on the error message received unandled_exception = True # The fault message is of form: @@ -67,6 +67,8 @@ def _rate_limit_request(self, request_method, args=None): time.sleep(sleep_amt) unhandled_exception = False + # The fault message is of form: + # The action could not be performed because there were too many requests by the client. too_many_requests_regex_match = re.search('^.+The action could not be performed because there were too many requests by the client.$', fault.faultString) if too_many_requests_regex_match is not None: time.sleep(60) From 2d11d9d12e225be87c4c3cacb2e48fd0d5b5e9de Mon Sep 17 00:00:00 2001 From: Gleb Nikonorov Date: Thu, 19 Nov 2020 23:49:39 -0500 Subject: [PATCH 4/4] review feedback --- update_index.py | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/update_index.py b/update_index.py index 66f9a29..7bd93a7 100644 --- a/update_index.py +++ b/update_index.py @@ -28,56 +28,50 @@ BLACKLIST = {"pytest-nbsmoke"} + class RateLimitedServerProxy: def __init__(self, uri): self._server_proxy = ServerProxy(uri) def browse(self, classifiers): - return self._rate_limit_request(self._server_proxy.browse, [classifiers]) + return self._rate_limit_request(self._server_proxy.browse, classifiers) def list_packages(self): return self._rate_limit_request(self._server_proxy.list_packages) def package_releases(self, package_name): - return self._rate_limit_request(self._server_proxy.package_releases, [package_name]) + return self._rate_limit_request(self._server_proxy.package_releases, package_name) def release_data(self, name, version): - return self._rate_limit_request(self._server_proxy.release_data, [name, version]) - - def _rate_limit_request(self, request_method, args=None): - return_value = None + return self._rate_limit_request(self._server_proxy.release_data, name, version) - fetched_releases = False - while not fetched_releases: + def _rate_limit_request(self, request_method, *args): + while True: try: - if args is not None: - return_value = request_method(*args) - else: - return_value = request_method() - fetched_releases = True + return request_method(*args) except Fault as fault: # If PyPI errors due to too many requests, sleep and try again depending on the error message received - unandled_exception = True - # The fault message is of form: # The action could not be performed because there were too many requests by the client. Limit may reset in 1 seconds. - limit_reset_regex_match = re.search('^.+Limit may reset in (\d+) seconds\.$', fault.faultString) + limit_reset_regex_match = re.search( + r"^.+Limit may reset in (\d+) seconds\.$", fault.faultString + ) if limit_reset_regex_match is not None: sleep_amt = int(limit_reset_regex_match.group(1)) time.sleep(sleep_amt) - unhandled_exception = False + continue # The fault message is of form: # The action could not be performed because there were too many requests by the client. - too_many_requests_regex_match = re.search('^.+The action could not be performed because there were too many requests by the client.$', fault.faultString) + too_many_requests_regex_match = re.search( + "^.+The action could not be performed because there were too many requests by the client.$", + fault.faultString, + ) if too_many_requests_regex_match is not None: time.sleep(60) - unhandled_exception = False - - if unhandled_exception: - raise fault + continue - return return_value + raise def iter_plugins(client, blacklist, *, consider_classifier=True):