From 99477132125e3ce6f16264d518f30eb9d2cd0283 Mon Sep 17 00:00:00 2001 From: InertiaUK Date: Mon, 27 Apr 2026 17:08:21 +0100 Subject: [PATCH] fix: MidSuffolkDistrictCouncil - replace sleep with WebDriverWait, fix date parsing, improve address matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit time.sleep(5) after clicking Find Address held Chrome idle at peak memory while the parallel Grid+local Chrome paths overlapped — causing the #17 __clone OOM crash in the Selenium Grid container. The dropdown populates in ~0.4s in practice, so the 5s sleep was pure idle contention. Changes: - Replace sleep(5) with WebDriverWait polling for a populated beats a fixed sleep — the dropdown typically + # renders in <1s, so a 5s sleep just held Chrome idle and let the + # parallel Grid+local Chromes overlap memory peaks. On a busy + # Selenium Grid that overlap is what triggered the + # `__clone` / OOM crash inside the Grid container. Now we exit as + # soon as options are available. + def _populated_select(d): + selects = d.find_elements(By.CSS_SELECTOR, "select") + for s in selects: + if len(s.find_elements(By.TAG_NAME, "option")) > 1: + return s + return False + + select_address_input = WebDriverWait(driver, 30).until(_populated_select) + + # Select address based on postcode and house number. Iterate and + # prefer an exact-prefix match over substring matches so e.g. + # "ANNEXE 91 THE COMMON" doesn't beat "91 THE COMMON" when the + # caller asked for house number 91. 
select = Select(select_address_input) - selected = False + postcode_upper = user_postcode.upper() + paon_str = str(user_paon).upper() + best_value = None + best_priority = 99 for addr_option in select.options: - if not addr_option.text or addr_option.text == "Please Select...": + if not addr_option.text or addr_option.text.strip() == "": continue - option_text = addr_option.text.upper() - postcode_upper = user_postcode.upper() - paon_str = str(user_paon).upper() + if postcode_upper not in option_text: + continue - # Check if this option contains both postcode and house number - if postcode_upper in option_text and ( - f"{paon_str} " in option_text - or f", {paon_str}," in option_text + if option_text.startswith(f"{paon_str} "): + priority = 0 + elif ( + f", {paon_str}," in option_text or f", {paon_str} " in option_text - or f", {paon_str}A," in option_text or option_text.endswith(f", {paon_str}") ): - select.select_by_value(addr_option.get_attribute("value")) - selected = True - break + priority = 1 + elif f", {paon_str}A," in option_text: + priority = 2 + elif f" {paon_str} " in option_text: + # Fallback substring match (e.g. "ANNEXE 91 THE COMMON"). + priority = 3 + else: + continue + + if priority < best_priority: + best_priority = priority + best_value = addr_option.get_attribute("value") + if priority == 0: + break - if not selected: + if best_value is None: raise ValueError( f"Address not found for postcode {user_postcode} and house number {user_paon}" ) + select.select_by_value(best_value) - wait = WebDriverWait(driver, 30) - wait.until(EC.presence_of_element_located((By.ID, "collection-cards"))) + WebDriverWait(driver, 30).until( + EC.presence_of_element_located((By.ID, "collection-cards")) + ) + # Wait until at least one card has populated — the JS observer + # builds cards asynchronously after the select change, and reading + # page_source the instant #collection-cards appears can return an + # empty container. Bound on cards rather than a fixed sleep. 
+ WebDriverWait(driver, 30).until( + lambda d: len( + d.find_elements( + By.CSS_SELECTOR, "#collection-cards .card h3" + ) + ) + > 0 + ) # Parse the HTML content soup = BeautifulSoup(driver.page_source, "html.parser") @@ -113,24 +155,28 @@ def parse_data(self, page: str, **kwargs) -> dict: p_tags = card.find_all("p") # any <p>
for p_tag in p_tags: - if p_tag.get_text().startswith("Frequency"): + text = p_tag.get_text() + if text.startswith("Frequency"): continue - # Collect text in p excluding the strong tag - date_str = (p_tag.get_text()).split(":")[1] - if " - " in date_str: - date_str = date_str.split(" - ")[1] - - collection_date = datetime.strptime(date_str, "%a %d %b %Y") - - # print(collection_date.strftime(date_format)) # Tue 03 Feb 2026 - - # Create the dictionary with the formatted data - dict_data = { - "type": collection_type, - "collectionDate": collection_date.strftime(date_format), - } - data["bins"].append(dict_data) + # A single <p> can contain multiple dates — the + # "Following Collections:" tag renders comma-separated + # dates when the council has 3 upcoming collections, + # and "Next Collection:" can be prefixed with + # "Today - ". Pull every well-formed date out + # of the text and emit one entry per date. + for date_str in _DATE_RE.findall(text): + collection_date = datetime.strptime( + date_str, "%a %d %b %Y" + ) + data["bins"].append( + { + "type": collection_type, + "collectionDate": collection_date.strftime( + date_format + ), + } + ) except Exception as e: # Here you can log the exception if needed print(f"An error occurred: {e}")