From 99477132125e3ce6f16264d518f30eb9d2cd0283 Mon Sep 17 00:00:00 2001
From: InertiaUK <mark@keepcomputing.co.uk>
Date: Mon, 27 Apr 2026 17:08:21 +0100
Subject: [PATCH] fix: MidSuffolkDistrictCouncil - replace sleep with
 WebDriverWait, fix date parsing, improve address matching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

time.sleep(5) after clicking Find Address held Chrome idle at peak memory
while the parallel Grid+local Chrome paths overlapped — causing the #17
__clone OOM crash in the Selenium Grid container. The dropdown populates
in ~0.4s in practice, so the 5s sleep was pure idle contention.

Changes:
- Replace sleep(5) with WebDriverWait polling for a populated <select>
  (>1 option), cutting ~4.5s off each run and eliminating the overlap
  window that triggered the Grid crash
- Add a second wait for at least one .card h3 after selecting the address,
  since JS builds collection cards asynchronously after the select change
- Fix date parsing: use _DATE_RE regex to extract every well-formed date
  from each <p> — a single tag can contain multiple comma-separated dates
  in the 'Following Collections:' case; old split(':') approach raised
  'unconverted data remains' errors in those cases
- Improve address matching priority: exact prefix > comma-bounded >
  A-suffix > substring, so '91 THE COMMON' beats 'ANNEXE 91 THE COMMON'
  when searching for house number 91
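- Remove unused import time
- Drop the redundant first wait for the postcode input and use 30s
  timeouts for the postcode input and address dropdown waits (was 10s)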
---
 .../councils/MidSuffolkDistrictCouncil.py     | 142 ++++++++++++------
 1 file changed, 94 insertions(+), 48 deletions(-)
diff --git a/uk_bin_collection/uk_bin_collection/councils/MidSuffolkDistrictCouncil.py b/uk_bin_collection/uk_bin_collection/councils/MidSuffolkDistrictCouncil.py
index 1797f9c4dd..adaa65fa0e 100644
--- a/uk_bin_collection/uk_bin_collection/councils/MidSuffolkDistrictCouncil.py
+++ b/uk_bin_collection/uk_bin_collection/councils/MidSuffolkDistrictCouncil.py
@@ -1,5 +1,5 @@
 import datetime
-import time
+import re
 from datetime import datetime
 
 from bs4 import BeautifulSoup
@@ -11,6 +11,16 @@
 from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass
 
 
+# Matches the council's date format e.g. "Wed 27 May 2026". Used to extract
+# every date in a <p> tag — a single tag can contain multiple dates separated
+# by commas when the council renders "Following Collections:" with two
+# upcoming dates, and may be prefixed "Today - <date>".
+_DATE_RE = re.compile(
+    r"\b(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun) \d{1,2} "
+    r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{4}\b"
+)
+
+
 # import the wonderful Beautiful Soup and the URL grabber
 class CouncilClass(AbstractGetBinDataClass):
     """
@@ -41,15 +51,8 @@ def parse_data(self, page: str, **kwargs) -> dict:
             driver = create_webdriver(web_driver, headless, None, __name__)
             driver.get(url)
 
-            wait = WebDriverWait(driver, 30)
-            wait.until(
-                EC.presence_of_element_located(
-                    (By.CSS_SELECTOR, '[aria-label="Postcode"]')
-                )
-            )
-
             # Enter postcode
-            postcode_input = WebDriverWait(driver, 10).until(
+            postcode_input = WebDriverWait(driver, 30).until(
                 EC.presence_of_element_located(
                     (By.CSS_SELECTOR, '[aria-label="Postcode"]')
                 )
@@ -63,43 +66,82 @@ def parse_data(self, page: str, **kwargs) -> dict:
             driver.execute_script("arguments[0].scrollIntoView();", find_address_button)
             driver.execute_script("arguments[0].click();", find_address_button)
 
-            time.sleep(5)
-            # Wait for address dropdown
-            select_address_input = WebDriverWait(driver, 10).until(
-                EC.presence_of_element_located((By.CSS_SELECTOR, "select"))
-            )
-
-            # Select address based on postcode and house number
+            # Wait for address dropdown to appear AND populate. Polling for a
+            # populated <select> beats a fixed sleep — the dropdown typically
+            # renders in <1s, so a 5s sleep just held Chrome idle and let the
+            # parallel Grid+local Chromes overlap memory peaks. On a busy
+            # Selenium Grid that overlap is what triggered the
+            # `__clone` / OOM crash inside the Grid container. Now we exit as
+            # soon as options are available.
+            def _populated_select(d):
+                selects = d.find_elements(By.CSS_SELECTOR, "select")
+                for s in selects:
+                    if len(s.find_elements(By.TAG_NAME, "option")) > 1:
+                        return s
+                return False
+
+            select_address_input = WebDriverWait(driver, 30).until(_populated_select)
+
+            # Select address based on postcode and house number. Iterate and
+            # prefer an exact-prefix match over substring matches so e.g.
+            # "ANNEXE 91 THE COMMON" doesn't beat "91 THE COMMON" when the
+            # caller asked for house number 91.
             select = Select(select_address_input)
-            selected = False
+            postcode_upper = user_postcode.upper()
+            paon_str = str(user_paon).upper()
 
+            best_value = None
+            best_priority = 99
             for addr_option in select.options:
-                if not addr_option.text or addr_option.text == "Please Select...":
+                if not addr_option.text or addr_option.text.strip() == "":
                     continue
-
                 option_text = addr_option.text.upper()
-                postcode_upper = user_postcode.upper()
-                paon_str = str(user_paon).upper()
+                if postcode_upper not in option_text:
+                    continue
 
-                # Check if this option contains both postcode and house number
-                if postcode_upper in option_text and (
-                    f"{paon_str} " in option_text
-                    or f", {paon_str}," in option_text
+                if option_text.startswith(f"{paon_str} "):
+                    priority = 0
+                elif (
+                    f", {paon_str}," in option_text
                     or f", {paon_str} " in option_text
-                    or f", {paon_str}A," in option_text
                     or option_text.endswith(f", {paon_str}")
                 ):
-                    select.select_by_value(addr_option.get_attribute("value"))
-                    selected = True
-                    break
+                    priority = 1
+                elif f", {paon_str}A," in option_text:
+                    priority = 2
+                elif f" {paon_str} " in option_text:
+                    # Fallback substring match (e.g. "ANNEXE 91 THE COMMON").
+                    priority = 3
+                else:
+                    continue
+
+                if priority < best_priority:
+                    best_priority = priority
+                    best_value = addr_option.get_attribute("value")
+                    if priority == 0:
+                        break
 
-            if not selected:
+            if best_value is None:
                 raise ValueError(
                     f"Address not found for postcode {user_postcode} and house number {user_paon}"
                 )
+            select.select_by_value(best_value)
 
-            wait = WebDriverWait(driver, 30)
-            wait.until(EC.presence_of_element_located((By.ID, "collection-cards")))
+            WebDriverWait(driver, 30).until(
+                EC.presence_of_element_located((By.ID, "collection-cards"))
+            )
+            # Wait until at least one card has populated — the JS observer
+            # builds cards asynchronously after the select change, and reading
+            # page_source the instant #collection-cards appears can return an
+            # empty container. Bound on cards rather than a fixed sleep.
+            WebDriverWait(driver, 30).until(
+                lambda d: len(
+                    d.find_elements(
+                        By.CSS_SELECTOR, "#collection-cards .card h3"
+                    )
+                )
+                > 0
+            )
 
             # Parse the HTML content
             soup = BeautifulSoup(driver.page_source, "html.parser")
@@ -113,24 +155,28 @@ def parse_data(self, page: str, **kwargs) -> dict:
                     p_tags = card.find_all("p")  # any <p>
 
                     for p_tag in p_tags:
-                        if p_tag.get_text().startswith("Frequency"):
+                        text = p_tag.get_text()
+                        if text.startswith("Frequency"):
                             continue
 
-                        # Collect text in p excluding the strong tag
-                        date_str = (p_tag.get_text()).split(":")[1]
-                        if " - " in date_str:
-                            date_str = date_str.split(" - ")[1]
-
-                        collection_date = datetime.strptime(date_str, "%a %d %b %Y")
-
-                        # print(collection_date.strftime(date_format))  # Tue 03 Feb 2026
-
-                        # Create the dictionary with the formatted data
-                        dict_data = {
-                            "type": collection_type,
-                            "collectionDate": collection_date.strftime(date_format),
-                        }
-                        data["bins"].append(dict_data)
+                        # A single <p> can contain multiple dates — the
+                        # "Following Collections:" tag renders comma-separated
+                        # dates when the council has 3 upcoming collections,
+                        # and "Next Collection:" can be prefixed with
+                        # "Today - <date>". Pull every well-formed date out
+                        # of the text and emit one entry per date.
+                        for date_str in _DATE_RE.findall(text):
+                            collection_date = datetime.strptime(
+                                date_str, "%a %d %b %Y"
+                            )
+                            data["bins"].append(
+                                {
+                                    "type": collection_type,
+                                    "collectionDate": collection_date.strftime(
+                                        date_format
+                                    ),
+                                }
+                            )
         except Exception as e:
             # Here you can log the exception if needed
             print(f"An error occurred: {e}")