diff --git a/uk_bin_collection/tests/input.json b/uk_bin_collection/tests/input.json
index 1967b2d9d6..f7883ad42c 100755
--- a/uk_bin_collection/tests/input.json
+++ b/uk_bin_collection/tests/input.json
@@ -1558,11 +1558,13 @@
         "LAD24CD": "E07000133"
     },
     "MertonCouncil": {
-        "uprn": "4328213",
+        "house_number": "16",
+        "postcode": "SW19 1QT",
         "skip_get_url": true,
+        "uprn": "4328213",
         "url": "https://fixmystreet.merton.gov.uk/waste/",
         "wiki_name": "Merton",
-        "wiki_note": "To get the UPRN, you can use [FindMyAddress](https://www.findmyaddress.co.uk/search).",
+        "wiki_note": "Provide your postcode and house number. A Merton-specific UPRN is also accepted; you can find it with [FindMyAddress](https://www.findmyaddress.co.uk/search).",
         "LAD24CD": "E09000024"
     },
     "MidAndEastAntrimBoroughCouncil": {
diff --git a/uk_bin_collection/uk_bin_collection/councils/MertonCouncil.py b/uk_bin_collection/uk_bin_collection/councils/MertonCouncil.py
index 79b64e2676..a706f8f74f 100644
--- a/uk_bin_collection/uk_bin_collection/councils/MertonCouncil.py
+++ b/uk_bin_collection/uk_bin_collection/councils/MertonCouncil.py
@@ -1,4 +1,3 @@
-# This script pulls (in one hit) the data from Merton Council Bins Data
 import time
 from datetime import datetime
 
@@ -8,168 +7,116 @@
 from uk_bin_collection.uk_bin_collection.common import date_format
 from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass
 
+BASE_URL = "https://fixmystreet.merton.gov.uk"
 
-# Council class for Merton Council
-class CouncilClass(AbstractGetBinDataClass):
-    """
-    Bin collection scraper for Merton Council.
 
-    This scraper retrieves bin collection schedules from the Merton Council
-    FixMyStreet-based website (fixmystreet.merton.gov.uk). The site uses
-    JavaScript to dynamically load data, requiring polling until content
-    is fully loaded.
+def _resolve_property_id(s, postcode, paon):
+    """Look up the FixMyStreet property id for a postcode, preferring the
+    address whose display text starts with the supplied house number."""
+    resp = s.post(f"{BASE_URL}/waste", data={"postcode": postcode}, timeout=30)
+    resp.raise_for_status()
+    soup = BeautifulSoup(resp.text, "html.parser")
+    select = soup.find("select", {"id": "address"})
+    if not select:
+        return None
+
+    paon_lower = (paon or "").strip().lower()
+    best = None
+    for opt in select.find_all("option"):
+        val = opt.get("value", "")
+        # Skip the placeholder and "can't find my address" entries.
+        if not val or val == "missing":
+            continue
+        text = opt.get_text(strip=True).lower()
+        if paon_lower and text.startswith(paon_lower):
+            return val
+        # Remember the first real address as a fallback.
+        if best is None:
+            best = val
 
-    Required Parameters:
-        uprn (str): Unique Property Reference Number (numeric only)
+    return best
 
-    Example:
-        >>> council = CouncilClass()
-        >>> data = council.run(uprn="4328213")
-    """
 
-    # Polling configuration for JavaScript-loaded data
+class CouncilClass(AbstractGetBinDataClass):
+    # The site loads collection data via JavaScript, so poll until it appears.
     MAX_POLLING_ATTEMPTS = 10
-    POLLING_SLEEP_SECONDS = 2
+    POLLING_SLEEP_SECONDS = 3
 
     def parse_data(self, page: str, **kwargs) -> dict:
-        """
-        Parse bin collection data from Merton Council's FixMyStreet website.
-
-        The Merton Council website uses JavaScript to dynamically load collection data.
-        This method polls the page until the data is fully loaded, then extracts
-        bin collection information including type and next collection date.
-
-        Args:
-            page (str): Unused - maintained for interface compatibility
-            **kwargs: Keyword arguments including:
-                - uprn (str): Unique Property Reference Number (numeric only)
-
-        Returns:
-            dict: A dictionary containing a list of bins with their collection dates:
-                {
-                    "bins": [
-                        {
-                            "type": str,  # Capitalized bin type (e.g., "Food waste")
-                            "collectionDate": str  # Formatted date string
-                        },
-                        ...
-                    ]
-                }
-
-        Raises:
-            ValueError: If uprn is not provided or contains non-numeric characters
-            Exception: If timeout occurs waiting for data or if collections div not found
-
-        Note:
-            - Skips booking services like "Bulky waste" and "Garden waste"
-            - Handles year-boundary dates (e.g., December dates for January collections)
-            - Results are sorted by collection date
-        """
         uprn = kwargs.get("uprn")
-        if not uprn:
-            raise ValueError("uprn is required")
-
-        # Validate UPRN format (must be numeric only)
-        if not str(uprn).isdigit():
-            raise ValueError("uprn must contain only numeric characters")
+        postcode = kwargs.get("postcode")
+        paon = kwargs.get("paon")
 
-        # The new Merton site uses JavaScript to load data dynamically.
-        # We poll the page until the loading indicator disappears.
-        url = f"https://fixmystreet.merton.gov.uk/waste/{uprn}?page_loading=1"
-        headers = {
-            "x-requested-with": "fetch",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
-        }
+        s = requests.Session()
+        s.headers.update(
+            {
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+            }
+        )
 
-        data = {"bins": []}
-        collections = []
+        property_id = None
 
-        # Poll until data is loaded
-        soup = None
-        for attempt in range(1, self.MAX_POLLING_ATTEMPTS + 1):
-            response = requests.get(url, headers=headers, timeout=10)
-            soup = BeautifulSoup(response.text, features="html.parser")
+        # A numeric UPRN can be used directly; a redirect back to /waste
+        # means the site did not recognise it as a Merton property.
+        if uprn and str(uprn).isdigit():
+            r = s.get(
+                f"{BASE_URL}/waste/{uprn}?page_loading=1",
+                headers={"x-requested-with": "fetch"},
+                timeout=10,
+            )
+            if r.status_code == 200 and not r.url.endswith("/waste"):
+                property_id = uprn
 
-            # Check if still loading
-            if soup.find(id="loading-indicator"):
-                if attempt < self.MAX_POLLING_ATTEMPTS:
-                    time.sleep(self.POLLING_SLEEP_SECONDS)
-                    continue
-                else:
-                    raise Exception("Timeout waiting for bin collection data to load")
-            break
+        # Otherwise resolve the property via the postcode address picker.
+        if not property_id and postcode:
+            property_id = _resolve_property_id(s, postcode, paon)
 
-        # Data loaded, parse it
-        collections_div = soup.find("div", class_="waste__collections")
-        if not collections_div:
-            raise Exception("Collections div not found")
+        if not property_id:
+            raise ValueError(
+                "Could not resolve property. Provide a postcode and house "
+                "number, or a valid Merton UPRN."
+            )
 
-        possible_formats = [
-            "%d %B %Y",
-            "%A %d %B %Y",
-        ]
+        url = f"{BASE_URL}/waste/{property_id}?page_loading=1"
+        headers = {"x-requested-with": "fetch"}
 
-        # Skip services that are not scheduled collections (booking services)
+        data = {"bins": []}
+        collections = []
         skip_services = ["Bulky waste", "Garden waste"]
-        govuk_grid_column_two_thirds = soup.find(
-            "div", class_="govuk-grid-column-two-thirds"
-        )
-        waste_service_grids = govuk_grid_column_two_thirds.find_all(
-            "div", class_="waste-service-grid"
-        )
-
-        for waste_service_grid in waste_service_grids:
-
-            h3 = waste_service_grid.find("h3", class_="waste-service-name")
-
-            bin_type = h3.get_text().strip()
-
-            # Skip booking services
+        # Poll until the JavaScript-rendered service names are present.
+        soup = None
+        for _ in range(self.MAX_POLLING_ATTEMPTS):
+            response = s.get(url, headers=headers, timeout=10)
+            soup = BeautifulSoup(response.text, features="html.parser")
+            if soup.find_all("h3", class_="waste-service-name"):
+                break
+            time.sleep(self.POLLING_SLEEP_SECONDS)
+        else:
+            raise RuntimeError("Timeout waiting for bin collection data to load")
+
+        grid_parent = soup.find("div", class_="govuk-grid-column-two-thirds")
+        if not grid_parent:
+            grid_parent = soup
+
+        for grid in grid_parent.find_all("div", class_="waste-service-grid"):
+            h3 = grid.find("h3", class_="waste-service-name")
+            if not h3:
+                continue
+            bin_type = h3.get_text(strip=True)
+            # Skip booking-only services that have no scheduled collections.
             if bin_type in skip_services:
                 continue
-            rows = waste_service_grid.find_all("div", class_="govuk-summary-list__row")
-            for row in rows:
-                key = row.find("dt", class_="govuk-summary-list__key")
-                value = row.find("dd", class_="govuk-summary-list__value")
-
-                if key and value and "Next collection" in key.get_text():
-                    collection_date_str = value.get_text().strip()
-
-                    # Parse the date - format is like "Saturday 15 November"
-                    collectionDate = None
-                    # Try with day of week
-                    date_parts = collection_date_str.split()
-                    if len(date_parts) >= 3:
-                        # Try parsing with day name, day, month
-                        day = date_parts[1]
-                        month = date_parts[2]
-                        year = datetime.now().year
-                        date_str = f"{day} {month} {year}"
-
-                        for format in possible_formats:
-                            try:
-                                collectionDate = datetime.strptime(date_str, format)
-                                # Handle year boundary: if parsed date is in the past, assume next year
-                                if collectionDate.date() < datetime.now().date():
-                                    collectionDate = collectionDate.replace(
-                                        year=year + 1
-                                    )
-                                break
-                            except ValueError:
-                                continue
-
-                    if collectionDate:
-                        # Add each collection to the list as a tuple
-                        collections.append((bin_type, collectionDate))
-
-        ordered_data = sorted(collections, key=lambda x: x[1])
-        for item in ordered_data:
-            dict_data = {
-                "type": item[0].capitalize(),
-                "collectionDate": item[1].strftime(date_format),
-            }
-            data["bins"].append(dict_data)
+            for row in grid.find_all("div", class_="govuk-summary-list__row"):
+                key = row.find("dt")
+                value = row.find("dd")
+                if not key or not value or "Next collection" not in key.get_text():
+                    continue
+                # Dates are rendered without a year, e.g. "Saturday 15 November".
+                date_text = value.get_text(strip=True)
+                parts = date_text.split()
+                if len(parts) < 3:
+                    continue
+                day_str = parts[1]
+                month_str = parts[2]
+                year = datetime.now().year
+                try:
+                    dt = datetime.strptime(f"{day_str} {month_str} {year}", "%d %B %Y")
+                    # Year boundary: a date already in the past belongs to next year.
+                    if dt.date() < datetime.now().date():
+                        dt = dt.replace(year=year + 1)
+                    collections.append((bin_type, dt))
+                except ValueError:
+                    continue
+
+        ordered = sorted(collections, key=lambda x: x[1])
+        for bin_type, dt in ordered:
+            data["bins"].append(
+                {
+                    "type": bin_type.capitalize(),
+                    "collectionDate": dt.strftime(date_format),
+                }
+            )
 
         return data
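
Reviewer note: a minimal sketch of how the new lookup order can be exercised directly, using the values from input.json above. It assumes parse_data() tolerates an empty page argument (the method ignores page), that the package is importable from the repo root, and that the harness maps house_number in input.json to the paon kwarg the new code reads; in normal use the framework drives this through its usual entry point instead.

```python
from uk_bin_collection.uk_bin_collection.councils.MertonCouncil import CouncilClass

council = CouncilClass()

# Postcode + house number: resolves the property id via the address picker.
result = council.parse_data("", postcode="SW19 1QT", paon="16")

# A valid Merton UPRN short-circuits the postcode lookup.
result = council.parse_data("", uprn="4328213")

for entry in result["bins"]:
    print(entry["type"], entry["collectionDate"])
```

Falling back from UPRN to postcode, rather than failing outright, keeps existing configurations working while letting new users supply the address details the FixMyStreet front end actually asks for.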