diff --git a/uk_bin_collection/tests/input.json b/uk_bin_collection/tests/input.json index a807fb2696..cd5c29d025 100755 --- a/uk_bin_collection/tests/input.json +++ b/uk_bin_collection/tests/input.json @@ -314,7 +314,7 @@ "house_number": "44", "postcode": "BN1 8NE", "skip_get_url": true, - "url": "https://cityclean.brighton-hove.gov.uk/link/collections", + "url": "https://enviroservices.brighton-hove.gov.uk/link/collections", "web_driver": "http://selenium:4444", "wiki_name": "Brighton and Hove", "wiki_note": "Use house number and postcode. Requires Selenium", @@ -705,6 +705,14 @@ "wiki_note": "You will need to use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find the UPRN.", "LAD24CD": "E08000027" }, + "DumfriesandGallowayCouncil": { + "uprn": "137034556", + "skip_get_url": true, + "url": "https://www.dumfriesandgalloway.gov.uk", + "wiki_name": "Dumfries and Galloway Council", + "wiki_note": "You will need to use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find the UPRN.", + "LAD24CD": "S12000006" + }, "DundeeCityCouncil": { "uprn": "9059043390", "url": "https://www.dundeecity.gov.uk/", @@ -1385,7 +1393,7 @@ "LondonBoroughHounslow": { "skip_get_url": true, "uprn": "100021577765", - "url": "https://www.hounslow.gov.uk/homepage/86/recycling_and_waste_collection_day_finder", + "url": "https://my.hounslow.gov.uk/service/Waste_and_recycling_collections", "wiki_name": "Hounslow", "wiki_note": "Pass the UPRN. You can find it using [FindMyAddress](https://www.findmyaddress.co.uk/search).", "LAD24CD": "E09000018" @@ -1547,7 +1555,6 @@ "house_number": "12 Constantine Court Park Road North, Middlesbrough", "skip_get_url": true, "url": "https://www.middlesbrough.gov.uk/recycling-and-rubbish/bin-collection-dates/", - "web_driver": "http://selenium:4444", "wiki_name": "Middlesbrough", "wiki_note": "Pass the entire address without postcode as it appears when you type it on the website. 
This parser requires a Selenium webdriver.", "LAD24CD": "E06000002" @@ -1652,13 +1659,11 @@ "LAD24CD": "E09000025" }, "NewportCityCouncil": { - "postcode": "NP20 4HE", + "uprn": "100100688819", "skip_get_url": true, - "house_number": "6", "url": "https://www.newport.gov.uk/", - "web_driver": "http://selenium:4444", "wiki_name": "Newport", - "wiki_note": "Pass the postcode and house number in their respective arguments, both wrapped in quotes.", + "wiki_note": "You will need to use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find your UPRN.", "LAD24CD": "W06000022" }, "NorthAyrshireCouncil": { @@ -1761,7 +1766,7 @@ "uprn": "47097627", "url": "https://www.northtyneside.gov.uk/waste-collection-schedule", "wiki_command_url_override": "https://www.northtyneside.gov.uk/waste-collection-schedule/view/XXXXXXXX", - "wiki_name": "North Tyneside", + "wiki_name": "North Tyneside", "wiki_note": "Pass only the UPRN (no postcode). You can find the UPRN using [FindMyAddress](https://www.findmyaddress.co.uk/search).", "LAD24CD": "E08000022" }, @@ -1797,7 +1802,7 @@ "house_number": "2", "postcode": "NR2 3TT", "url": "https://bnr-wrp.whitespacews.com", - "wiki_command_url_override": "hhttps://bnr-wrp.whitespacews.com", + "wiki_command_url_override": "https://bnr-wrp.whitespacews.com", "wiki_name": "Norwich", "wiki_note": "Pass the house number and postcode in their respective parameters.", "LAD24CD": "E07000148" @@ -1957,7 +1962,7 @@ "RochdaleCouncil": { "postcode": "OL11 5BE", "skip_get_url": true, - "uprn": "23049922", + "uprn": "10094358428", "url": "https://webforms.rochdale.gov.uk/BinCalendar", "wiki_name": "Rochdale", "wiki_note": "Provide your UPRN and postcode. 
You can find your UPRN using [FindMyAddress](https://www.findmyaddress.co.uk/search).", diff --git a/uk_bin_collection/uk_bin_collection/councils/BostonBoroughCouncil.py b/uk_bin_collection/uk_bin_collection/councils/BostonBoroughCouncil.py index 0a543093b2..37378e0ef1 100644 --- a/uk_bin_collection/uk_bin_collection/councils/BostonBoroughCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/BostonBoroughCouncil.py @@ -1,4 +1,11 @@ +import time + from bs4 import BeautifulSoup +from selenium.common.exceptions import ( + ElementClickInterceptedException, + NoSuchElementException, + TimeoutException, +) from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import Select @@ -28,13 +35,30 @@ def parse_data(self, page: str, **kwargs) -> dict: check_postcode(user_postcode) # Create Selenium webdriver - driver = create_webdriver(web_driver, headless, None, __name__) + user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + driver = create_webdriver(web_driver, headless, user_agent, __name__) driver.get("https://www.boston.gov.uk/findwastecollections") - accept_button = WebDriverWait(driver, timeout=30).until( - EC.element_to_be_clickable((By.NAME, "acceptall")) + # Wait for initial page load and Cloudflare bypass + WebDriverWait(driver, 30).until( + lambda d: "Just a moment" not in d.title and d.title != "" ) - accept_button.click() + time.sleep(3) + + # Try to accept cookies if the banner appears + try: + accept_button = WebDriverWait(driver, timeout=10).until( + EC.element_to_be_clickable((By.NAME, "acceptall")) + ) + accept_button.click() + time.sleep(2) + except ( + TimeoutException, + NoSuchElementException, + ElementClickInterceptedException, + ): + # Cookie banner not present or not clickable; continue without accepting + pass # Wait for the postcode field to appear then populate it 
inputElement_postcode = WebDriverWait(driver, 30).until( diff --git a/uk_bin_collection/uk_bin_collection/councils/BrightonandHoveCityCouncil.py b/uk_bin_collection/uk_bin_collection/councils/BrightonandHoveCityCouncil.py index 4055d8af77..f7d6bdc7a2 100644 --- a/uk_bin_collection/uk_bin_collection/councils/BrightonandHoveCityCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/BrightonandHoveCityCouncil.py @@ -29,14 +29,14 @@ def parse_data(self, page: str, **kwargs) -> dict: try: data = {"bins": []} headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"} - + url = "https://enviroservices.brighton-hove.gov.uk/link/collections" uprn = kwargs.get("uprn") user_paon = kwargs.get("paon") postcode = kwargs.get("postcode") web_driver = kwargs.get("web_driver") headless = kwargs.get("headless") driver = create_webdriver(web_driver, headless, None, __name__) - driver.get(kwargs.get("url")) + driver.get(url) wait = WebDriverWait(driver, 60) post_code_search = wait.until( diff --git a/uk_bin_collection/uk_bin_collection/councils/ChelmsfordCityCouncil.py b/uk_bin_collection/uk_bin_collection/councils/ChelmsfordCityCouncil.py index 9483084788..6cad1e8183 100644 --- a/uk_bin_collection/uk_bin_collection/councils/ChelmsfordCityCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/ChelmsfordCityCouncil.py @@ -1,95 +1,114 @@ import re import time from datetime import datetime, timedelta + import requests from bs4 import BeautifulSoup +from icalevents.icalevents import events from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait -from icalevents.icalevents import events from uk_bin_collection.uk_bin_collection.common import * from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass + class CouncilClass(AbstractGetBinDataClass): def parse_data(self, page: str, **kwargs) -> dict: driver = None try: data = {"bins": 
[]} - headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"} - + user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64)" + postcode = kwargs.get("postcode") user_paon = kwargs.get("paon") web_driver = kwargs.get("web_driver") headless = kwargs.get("headless") - - driver = create_webdriver(web_driver, headless, None, __name__) + + driver = create_webdriver(web_driver, headless, user_agent, __name__) wait = WebDriverWait(driver, 30) - + # Navigate to bin collection page - driver.get("https://www.chelmsford.gov.uk/bins-and-recycling/check-your-collection-day/") - + driver.get( + "https://www.chelmsford.gov.uk/bins-and-recycling/check-your-collection-day/" + ) + # Handle cookie overlay try: accept_btn = wait.until( - EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'ACCEPT')]")) + EC.element_to_be_clickable( + (By.XPATH, "//*[contains(text(), 'ACCEPT')]") + ) ) accept_btn.click() time.sleep(1) - except: + except Exception as e: + # Cookie banner not present or already accepted pass - + # Find postcode input field (dynamic ID) postcode_input = wait.until( - EC.presence_of_element_located((By.XPATH, "//input[contains(@id, '_keyword')]")) + EC.presence_of_element_located( + (By.XPATH, "//input[contains(@id, '_keyword')]") + ) ) postcode_input.clear() postcode_input.send_keys(postcode) - + # Click search button submit_btn = wait.until( EC.element_to_be_clickable((By.CLASS_NAME, "__submitButton")) ) submit_btn.click() - + # Wait for results table wait.until(EC.presence_of_element_located((By.TAG_NAME, "table"))) - + # Get the collection round from the table row soup = BeautifulSoup(driver.page_source, features="html.parser") - + # Find the row containing the address for row in soup.find_all("tr"): if user_paon in row.get_text(): # Extract collection round (e.g., "Tuesday B") row_text = row.get_text() - round_match = re.search(r"(Monday|Tuesday|Wednesday|Thursday|Friday)\s+([AB])", row_text) + round_match = re.search( + 
r"(Monday|Tuesday|Wednesday|Thursday|Friday)\s+([AB])", row_text + ) if round_match: day = round_match.group(1).lower() letter = round_match.group(2).lower() - ics_url = f"https://www.chelmsford.gov.uk/media/4ipavf0m/{day}-{letter}-calendar.ics" + ics_url = f"https://www.chelmsford.gov.uk/media/t03c4mik/{day}-{letter}-2025-26.ics" break else: - raise ValueError(f"Could not find collection round for address: {user_paon}") - + raise ValueError( + f"Could not find collection round for address: {user_paon}" + ) + # Get events from ICS file within the next 60 days now = datetime.now() future = now + timedelta(days=60) - + # Parse ICS calendar upcoming_events = events(ics_url, start=now, end=future) - + for event in sorted(upcoming_events, key=lambda e: e.start): if event.summary and event.start: - data["bins"].append({ - "type": event.summary, - "collectionDate": event.start.date().strftime(date_format) - }) + collections = event.summary.split(",") + for collection in collections: + data["bins"].append( + { + "type": collection.strip(), + "collectionDate": event.start.date().strftime( + date_format + ), + } + ) except Exception as e: print(f"An error occurred: {e}") raise finally: if driver: driver.quit() - - return data \ No newline at end of file + + return data diff --git a/uk_bin_collection/uk_bin_collection/councils/DerbyCityCouncil.py b/uk_bin_collection/uk_bin_collection/councils/DerbyCityCouncil.py index 8f090ec057..a37230fd3f 100644 --- a/uk_bin_collection/uk_bin_collection/councils/DerbyCityCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/DerbyCityCouncil.py @@ -19,7 +19,7 @@ def parse_data(self, page: str, **kwargs) -> dict: check_uprn(user_uprn) bindata = {"bins": []} - URI = f"https://secure.derby.gov.uk/binday/Binday?search.PremisesId={user_uprn}" + URI = f"https://secure.derby.gov.uk/binday/BinDays/{user_uprn}" # Make the GET request session = requests.Session() diff --git 
a/uk_bin_collection/uk_bin_collection/councils/DumfriesandGallowayCouncil.py b/uk_bin_collection/uk_bin_collection/councils/DumfriesandGallowayCouncil.py new file mode 100644 index 0000000000..dbe3c19e6f --- /dev/null +++ b/uk_bin_collection/uk_bin_collection/councils/DumfriesandGallowayCouncil.py @@ -0,0 +1,53 @@ +import re +import time +from datetime import datetime, timedelta + +import requests +from bs4 import BeautifulSoup +from icalevents.icalevents import events +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.wait import WebDriverWait + +from uk_bin_collection.uk_bin_collection.common import * +from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass + + +class CouncilClass(AbstractGetBinDataClass): + def parse_data(self, page: str, **kwargs) -> dict: + driver = None + try: + data = {"bins": []} + + user_uprn = kwargs.get("uprn") + check_uprn(user_uprn) + + ics_url = f"https://www.dumfriesandgalloway.gov.uk/bins-recycling/waste-collection-schedule/download/{user_uprn}" + + # Get events from ICS file within the next 60 days + now = datetime.now() + future = now + timedelta(days=60) + + # Parse ICS calendar + upcoming_events = events(ics_url, start=now, end=future) + + for event in sorted(upcoming_events, key=lambda e: e.start): + if event.summary and event.start: + collections = event.summary.split(",") + for collection in collections: + data["bins"].append( + { + "type": collection.strip(), + "collectionDate": event.start.date().strftime( + date_format + ), + } + ) + except Exception as e: + print(f"An error occurred: {e}") + raise + finally: + if driver: + driver.quit() + + return data diff --git a/uk_bin_collection/uk_bin_collection/councils/HartDistrictCouncil.py b/uk_bin_collection/uk_bin_collection/councils/HartDistrictCouncil.py index edce893514..71983a5aa8 100755 --- 
a/uk_bin_collection/uk_bin_collection/councils/HartDistrictCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/HartDistrictCouncil.py @@ -35,15 +35,19 @@ def parse_data(self, page: str, **kwargs) -> dict: # Iterate through each row for row in rows: - cells = row.find_all("td") + bin_types = row.find("td", class_="bin-service") - # Check if there are exactly 3 cells in the row - if len(cells) == 3: - bin_type = cells[0].get_text(strip=True) - collection_date = self.format_date(cells[2].get_text(strip=True)) + bin_types = bin_types.text.split("&") - # Create a dictionary for each bin and append to the bins list - bins.append({"type": bin_type, "collectionDate": collection_date}) + collection_date = row.find("td", class_="bin-service-date") + + collection_date = self.format_date(collection_date.text.strip()) + + for bin_type in bin_types: + # Create a dictionary for each bin and append to the bins list + bins.append( + {"type": bin_type.strip(), "collectionDate": collection_date} + ) return {"bins": bins} diff --git a/uk_bin_collection/uk_bin_collection/councils/LondonBoroughHarrow.py b/uk_bin_collection/uk_bin_collection/councils/LondonBoroughHarrow.py index 8009f96578..a1a8027c31 100644 --- a/uk_bin_collection/uk_bin_collection/councils/LondonBoroughHarrow.py +++ b/uk_bin_collection/uk_bin_collection/councils/LondonBoroughHarrow.py @@ -24,7 +24,8 @@ def parse_data(self, page: str, **kwargs) -> dict: URI = f"https://www.harrow.gov.uk/ajax/bins?u={user_uprn}&r=12345" # Make the GET request - response = requests.get(URI) + headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"} + response = requests.get(URI, headers=headers, timeout=30) # Parse the JSON response bin_collection = response.json() diff --git a/uk_bin_collection/uk_bin_collection/councils/LondonBoroughHounslow.py b/uk_bin_collection/uk_bin_collection/councils/LondonBoroughHounslow.py index 27a7d9574a..8ff3fe6b3b 100644 --- 
a/uk_bin_collection/uk_bin_collection/councils/LondonBoroughHounslow.py +++ b/uk_bin_collection/uk_bin_collection/councils/LondonBoroughHounslow.py @@ -1,4 +1,7 @@ -from bs4 import BeautifulSoup +import time + +import requests + from uk_bin_collection.uk_bin_collection.common import * from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass @@ -12,71 +15,97 @@ class CouncilClass(AbstractGetBinDataClass): """ def parse_data(self, page: str, **kwargs) -> dict: - api_url = "https://www.hounslow.gov.uk/homepage/86/recycling_and_waste_collection_day_finder" user_uprn = kwargs.get("uprn") - - # Check the UPRN is valid check_uprn(user_uprn) + bindata = {"bins": []} + + SESSION_URL = "https://my.hounslow.gov.uk/authapi/isauthenticated?uri=https%253A%252F%252Fmy.hounslow.gov.uk%252Fservice%252FWaste_and_recycling_collections&hostname=my.hounslow.gov.uk&withCredentials=true" + + API_URL = "https://my.hounslow.gov.uk/apibroker/runLookup" + + headers = { + "Content-Type": "application/json", + "Accept": "application/json", + "User-Agent": "Mozilla/5.0", + "X-Requested-With": "XMLHttpRequest", + "Referer": "https://my.hounslow.gov.uk/fillform/?iframe_id=fillform-frame-1&db_id=", + } + s = requests.session() + r = s.get(SESSION_URL) + r.raise_for_status() + session_data = r.json() + sid = session_data["auth-session"] + params = { + "id": "655f4290810cf", + "repeat_against": "", + "noRetry": "true", + "getOnlyTokens": "undefined", + "log_id": "", + "app_name": "AF-Renderer::Self", + # unix_timestamp + "_": str(int(time.time() * 1000)), + "sid": sid, + } - # Create the form data - form_data = { - "UPRN": user_uprn, + r = s.post(API_URL, headers=headers, params=params) + r.raise_for_status() + + data = r.json() + rows_data = data["integration"]["transformed"]["rows_data"]["0"] + if not isinstance(rows_data, dict): + raise ValueError("Invalid data returned from API") + token = rows_data["bartecToken"] + + data = { + "formValues": { + "Your address": { + 
"searchUPRN": { + "value": user_uprn, + }, + "bartecToken": { + "value": token, + }, + "searchFromDate": { + "value": datetime.now().strftime("%Y-%m-%d"), + }, + "searchToDate": { + "value": (datetime.now() + timedelta(days=30)).strftime( + "%Y-%m-%d" + ), + }, + }, + }, + } + + params = { + "id": "659eb39b66d5a", + "repeat_against": "", + "noRetry": "false", + "getOnlyTokens": "undefined", + "log_id": "", + "app_name": "AF-Renderer::Self", + # unix_timestamp + "_": str(int(time.time() * 1000)), + "sid": sid, } - # Make a request to the API - requests.packages.urllib3.disable_warnings() - response = requests.post(api_url, data=form_data) - - # Make a BS4 object - soup = BeautifulSoup(response.text, features="html.parser") - soup.prettify() - - data = {"bins": []} - - # Get the div element - div_element = soup.find("div", {"class": "bin_day_main_wrapper"}) - - # Get all bins with their corresponding dates using list comprehension - # This creates a list of tuples, where each tuple contains the bin type and collection date - bins_with_dates = [ - ( - bin.get_text().strip(), - h4.get_text().replace("This ", "").replace("Next ", ""), - ) - # This first for loop iterates over each h4 element - for h4 in div_element.find_all("h4") - # This nested for loop iterates over each li element within the corresponding ul element - for bin in h4.find_next_sibling("ul").find_all("li") - ] - - for bin_type, collection_date in bins_with_dates: - if "-" in collection_date: - date_part = collection_date.split(" - ")[1] - data["bins"].append( - { - "type": bin_type, - "collectionDate": datetime.strptime( - date_part, "%d %b %Y" - ).strftime(date_format), - } - ) - elif len(collection_date.split(" ")) == 4: - data["bins"].append( - { - "type": bin_type, - "collectionDate": datetime.strptime( - collection_date, "%A %d %b %Y" - ).strftime(date_format), - } - ) - else: - data["bins"].append( - { - "type": bin_type, - "collectionDate": datetime.strptime( - collection_date, "%d %b %Y" - 
).strftime(date_format), - } - ) - - return data + r = s.post(API_URL, json=data, headers=headers, params=params) + r.raise_for_status() + + data = r.json() + rows_data = data["integration"]["transformed"]["rows_data"]["0"] + if not isinstance(rows_data, dict): + raise ValueError("Invalid data returned from API") + + collections = json.loads(rows_data["jobsJSON"]) + + for collection in collections: + dict_data = { + "type": collection["jobType"], + "collectionDate": datetime.strptime( + collection["jobDate"], "%Y-%m-%d" + ).strftime(date_format), + } + bindata["bins"].append(dict_data) + + return bindata diff --git a/uk_bin_collection/uk_bin_collection/councils/MiddlesbroughCouncil.py b/uk_bin_collection/uk_bin_collection/councils/MiddlesbroughCouncil.py index 59ba493098..05913f641d 100644 --- a/uk_bin_collection/uk_bin_collection/councils/MiddlesbroughCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/MiddlesbroughCouncil.py @@ -1,17 +1,12 @@ +import re import time -from datetime import datetime +from datetime import date, datetime -from bs4 import BeautifulSoup -from selenium import webdriver -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support.ui import Select, WebDriverWait +import requests from uk_bin_collection.uk_bin_collection.common import * from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass -import re - class CouncilClass(AbstractGetBinDataClass): def parse_data(self, page: str, **kwargs) -> dict: @@ -19,91 +14,107 @@ def parse_data(self, page: str, **kwargs) -> dict: data = {"bins": []} user_paon = kwargs.get("paon") - headless = kwargs.get("headless") - web_driver = kwargs.get("web_driver") - driver = create_webdriver(web_driver, headless, None, __name__) - - page = "https://www.middlesbrough.gov.uk/recycling-and-rubbish/bin-collection-dates/" - driver.get(page) - - address_box = WebDriverWait(driver, timeout=15).until( 
- EC.presence_of_element_located((By.ID, "row-input-0")) - ) - address_box.click() - address_box.send_keys(user_paon) - - search_button = WebDriverWait(driver, timeout=15).until( - EC.presence_of_element_located((By.ID, "rCbtn-search")) - ) - search_button.click() - - iframe_presense = WebDriverWait(driver, 30).until( - EC.presence_of_element_located((By.ID, "recollect-frame")) - ) - driver.switch_to.frame(iframe_presense) - - results = WebDriverWait(driver, timeout=15).until( - EC.presence_of_element_located((By.ID, "rCpage-place_calendar")) - ) - - html_content = driver.page_source - soup = BeautifulSoup(html_content, "html.parser") - - calendar_section = soup.find("section", {"id": "alt-calendar-list"}) - if not calendar_section: - raise ValueError("Calendar section not found in the HTML.") - - date_headers = calendar_section.find_all("h3") - collection_lists = calendar_section.find_all("ul") - - current_month = datetime.now().month - current_year = datetime.now().year - - for date_header, collection_list in zip(date_headers, collection_lists): - raw_date = date_header.text.strip() - - # **Regex to match "Wednesday, February 19" format** - match = re.match(r"([A-Za-z]+), ([A-Za-z]+) (\d{1,2})", raw_date) - - if match: - day_name, month_name, day_number = ( - match.groups() - ) # Extract components - extracted_month = datetime.strptime(month_name, "%B").month - extracted_day = int(day_number) - - # Handle Dec-Jan rollover: If month is before the current month, assume next year - inferred_year = ( - current_year + 1 - if extracted_month < current_month - else current_year - ) - # **Correct the raw_date format before parsing** - raw_date = f"{day_name}, {month_name} {day_number}, {inferred_year}" - - print( - f"DEBUG: Final raw_date before parsing -> {raw_date}" - ) # Debugging output - - # Convert to required format (%d/%m/%Y) - try: - parsed_date = datetime.strptime(raw_date, "%A, %B %d, %Y") - formatted_date = parsed_date.strftime(date_format) - except 
ValueError: - raise ValueError(f"Date format error after inference: {raw_date}") - - for li in collection_list.find_all("li"): - bin_type = li.get_text(strip=True).split(".")[0] + check_paon(user_paon) + + url = "https://api.eu.recollect.net/api/areas/MiddlesbroughUK/services/50005/address-suggest" + params = { + "q": user_paon, + "locale": "en-GB", + "_": str(int(time.time() * 1000)), + } + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36", + } + + response = requests.get(url, headers=headers, params=params) + + addresses = response.json() + for address in addresses: + if "place_id" in address: + place_id = address["place_id"] + break + + if not place_id: + print(f"An error occurred: retrieving the address") + return + + url = "https://api.eu.recollect.net/api/areas/MiddlesbroughUK/services/50005/pages/en-GB/place_calendar.json?widget_config=%7B%22area%22%3A%22MiddlesbroughUK%22%2C%22name%22%3A%22calendar%22%2C%22base%22%3A%22https%3A%2F%2Frecollect.net%22%2C%22third_party_cookie_enabled%22%3A1%2C%22place_not_found_in_guest%22%3A0%2C%22is_guest_service%22%3A0%7D" + params = { + "q": user_paon, + "locale": "en-GB", + "_": str(int(time.time() * 1000)), + } + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36", + "x-recollect-place": place_id + ":50005", + } + response = requests.get(url, headers=headers, params=params) + # response = response.json() + + def extract_next_collection(payload: dict): + # 1) Find the "Next Collection" section + sections = payload.get("sections", []) + next_col_section = next( + (s for s in sections if s.get("title") == "Next Collection"), None + ) + if not next_col_section: + return None + + rows = next_col_section.get("rows", []) + + # 2) First row is the date inside + next_date = None + if rows and rows[0].get("type") == "html": + html = rows[0].get("html", 
"") + # grab text inside + m = re.search(r"(.*?)", html, flags=re.I | re.S) + if m: + date_text = m.group(1).strip() + # e.g. "Wednesday, October 29, 2025" + try: + next_date = datetime.strptime( + date_text, "%A, %B %d, %Y" + ).date() + except ValueError: + # Fallback: strip tags and leave raw text if format changes + next_date = date_text + + # 3) Remaining rows of type "rich-content" hold the bin types + bins = [] + for r in rows[1:]: + if r.get("type") == "rich-content": + label = r.get("label") or r.get( + "html" + ) # "Refuse", "Recycling", etc. + flag = (r.get("data") or {}).get( + "flag" + ) # "REFUSE", "RECYCLING", etc. + if label or flag: + bins.append({"label": label, "flag": flag}) + + return {"date": next_date, "bins": bins} + + # Example: + result = extract_next_collection(response.json()) + + if result and result.get("date") and result.get("bins"): + d = result["date"] + formatted_date = ( + d.strftime(date_format) if isinstance(d, date) else str(d) + ) + + for b in result["bins"]: + bin_type = b.get("label") or b.get( + "flag" + ) # e.g., "Refuse" or "RECYCLING" + if not bin_type: + continue data["bins"].append( {"type": bin_type, "collectionDate": formatted_date} ) + return data except Exception as e: print(f"An error occurred: {e}") raise - finally: - if driver: - driver.quit() - - return data diff --git a/uk_bin_collection/uk_bin_collection/councils/NewportCityCouncil.py b/uk_bin_collection/uk_bin_collection/councils/NewportCityCouncil.py index 6b56175160..462f3b9546 100644 --- a/uk_bin_collection/uk_bin_collection/councils/NewportCityCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/NewportCityCouncil.py @@ -1,15 +1,27 @@ -import datetime +import json +from dataclasses import asdict, dataclass +from typing import Literal -from bs4 import BeautifulSoup -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support.wait import WebDriverWait +import 
requests +from cryptography.hazmat.backends import default_backend +from cryptography.hazmat.primitives import padding +from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes -from uk_bin_collection.uk_bin_collection.common import * +from uk_bin_collection.uk_bin_collection.common import check_uprn from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass +key_hex = "F57E76482EE3DC3336495DEDEEF3962671B054FE353E815145E29C5689F72FEC" +iv_hex = "2CBF4FC35C69B82362D393A4F0B9971A" + + +@dataclass +class NewportInput: + P_CLIENT_ID: Literal[130] + P_COUNCIL_ID: Literal[260] + P_LANG_CODE: Literal["EN"] + P_UPRN: str + -# import the wonderful Beautiful Soup and the URL grabber class CouncilClass(AbstractGetBinDataClass): """ Concrete classes have to implement all abstract operations of the @@ -17,116 +29,73 @@ class CouncilClass(AbstractGetBinDataClass): implementation. """ - def parse_data(self, page: str, **kwargs) -> dict: - driver = None - try: - data = {"bins": []} - url = kwargs.get("url") - user_paon = kwargs.get("paon") - user_postcode = kwargs.get("postcode") - web_driver = kwargs.get("web_driver") - headless = kwargs.get("headless") - check_paon(user_paon) - check_postcode(user_postcode) - - # Use a realistic user agent to help bypass Cloudflare - user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" - driver = create_webdriver(web_driver, headless, user_agent, __name__) - driver.get( - "https://iportal.itouchvision.com/icollectionday/collection-day/?uuid=6CDD2A34C912312074D8E2410531401A8C00EFF7&lang=en" - ) + def encode_body(self, newport_input: NewportInput): + key = bytes.fromhex(key_hex) + iv = bytes.fromhex(iv_hex) - # Wait for the postcode field to appear then populate it - inputElement_postcode = WebDriverWait(driver, 30).until( - EC.presence_of_element_located((By.ID, "postcodeSearch")) - ) - 
inputElement_postcode.send_keys(user_postcode) + json_data = json.dumps(asdict(newport_input)) + data_bytes = json_data.encode("utf-8") - # Click search button - findAddress = WebDriverWait(driver, 10).until( - EC.presence_of_element_located((By.CLASS_NAME, "govuk-button")) - ) - findAddress.click() - - # Wait for the 'Select address' dropdown to appear and select option matching the house name/number - WebDriverWait(driver, 10).until( - EC.element_to_be_clickable( - ( - By.XPATH, - "//select[@id='addressSelect']//option[contains(., '" - + user_paon - + "')]", - ) - ) - ).click() - - # Wait for the collections table to appear - WebDriverWait(driver, 20).until( - EC.presence_of_element_located( - ( - By.XPATH, - "//h2[contains(@class,'mt-4') and contains(@class,'govuk-heading-s') and normalize-space(.)='Your next collections']", - ) - ) - ) + padder = padding.PKCS7(128).padder() + padded_data = padder.update(data_bytes) + padder.finalize() - soup = BeautifulSoup(driver.page_source, features="html.parser") + backend = default_backend() + cipher = Cipher(algorithms.AES(key), modes.CBC(iv), backend=backend) + encryptor = cipher.encryptor() + ciphertext = encryptor.update(padded_data) + encryptor.finalize() - collections = soup.find_all("div", {"class": "p-2"}) + return ciphertext.hex() - for collection in collections: - bin_type = collection.find("h3").get_text() + def decode_response(self, hex_input: str): - next_collection = soup.find("div", {"class": "fw-bold"}).get_text() + key = bytes.fromhex(key_hex) + iv = bytes.fromhex(iv_hex) + ciphertext = bytes.fromhex(hex_input) - following_collection = soup.find( - lambda t: ( - t.name == "div" - and t.get_text(strip=True).lower().startswith("followed by") - ) - ).get_text() + backend = default_backend() + cipher = Cipher(algorithms.AES(key), modes.CBC(iv), backend=backend) + decryptor = cipher.decryptor() + decrypted_padded = decryptor.update(ciphertext) + decryptor.finalize() - next_collection_date = 
datetime.strptime(next_collection, "%A %d %B") + unpadder = padding.PKCS7(128).unpadder() + plaintext_bytes = unpadder.update(decrypted_padded) + unpadder.finalize() + plaintext = plaintext_bytes.decode("utf-8") - following_collection_date = datetime.strptime( - following_collection, "followed by %A %d %B" - ) + return json.loads(plaintext) - current_date = datetime.now() - next_collection_date = next_collection_date.replace( - year=current_date.year - ) - following_collection_date = following_collection_date.replace( - year=current_date.year - ) - - next_collection_date = get_next_occurrence_from_day_month( - next_collection_date - ) + def parse_data(self, _: str, **kwargs) -> dict: + try: + user_uprn: str = kwargs.get("uprn") or "" + check_uprn(user_uprn) + newport_input = NewportInput( + P_CLIENT_ID=130, P_COUNCIL_ID=260, P_LANG_CODE="EN", P_UPRN=user_uprn + ) - following_collection_date = get_next_occurrence_from_day_month( - following_collection_date - ) + encoded_input = self.encode_body(newport_input) - dict_data = { - "type": bin_type, - "collectionDate": next_collection_date.strftime(date_format), - } - data["bins"].append(dict_data) + session = requests.Session() + response = session.post( + "https://iweb.itouchvision.com/portal/itouchvision/kmbd/collectionDay", + data=encoded_input, + ) - dict_data = { - "type": bin_type, - "collectionDate": following_collection_date.strftime(date_format), - } - data["bins"].append(dict_data) + output = response.text + + decoded_bins = self.decode_response(output) + data: dict[str, list[dict[str, str]]] = {} + data["bins"] = list( + map( + lambda a: { + "type": a["binType"], + "collectionDate": a["collectionDay"].replace("-", "/"), + }, + decoded_bins["collectionDay"], + ) + ) except Exception as e: # Here you can log the exception if needed print(f"An error occurred: {e}") # Optionally, re-raise the exception if you want it to propagate raise - finally: - # This block ensures that the driver is closed regardless of an 
exception - if driver: - driver.quit() return data diff --git a/uk_bin_collection/uk_bin_collection/councils/RochdaleCouncil.py b/uk_bin_collection/uk_bin_collection/councils/RochdaleCouncil.py index cd019149fe..93dbf90bd1 100644 --- a/uk_bin_collection/uk_bin_collection/councils/RochdaleCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/RochdaleCouncil.py @@ -1,4 +1,7 @@ -from bs4 import BeautifulSoup +import time + +import requests + from uk_bin_collection.uk_bin_collection.common import * from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass @@ -12,58 +15,100 @@ class CouncilClass(AbstractGetBinDataClass): """ def parse_data(self, page: str, **kwargs) -> dict: - api_url = "https://webforms.rochdale.gov.uk/BinCalendar" user_postcode = kwargs.get("postcode") user_uprn = kwargs.get("uprn") - - # Check the postcode and UPRN are valid check_postcode(user_postcode) check_uprn(user_uprn) + bindata = {"bins": []} - # Create the form data - form_data = { - "PostCode": user_postcode, - "SelectedUprn": user_uprn, - "Step": 2, - } + SESSION_URL = "https://rochdale-self.achieveservice.com/authapi/isauthenticated?uri=https%253A%252F%252Frochdale-self.achieveservice.com%252Fservice%252FBins___view_your_waste_collection_calendar&hostname=rochdale-self.achieveservice.com&withCredentials=true" - # Make a request to the API - requests.packages.urllib3.disable_warnings() - response = requests.post(api_url, data=form_data) + API_URL = "https://rochdale-self.achieveservice.com/apibroker/runLookup" - # Make a BS4 object - soup = BeautifulSoup(response.text, features="html.parser") - soup.prettify() + headers = { + "Content-Type": "application/json", + "Accept": "application/json", + "User-Agent": "Mozilla/5.0", + "X-Requested-With": "XMLHttpRequest", + "Referer": "https://rochdale-self.achieveservice.com/fillform/?iframe_id=fillform-frame-1&db_id=", + } + s = requests.session() + r = s.get(SESSION_URL) + r.raise_for_status() + session_data = 
r.json() + sid = session_data["auth-session"] + params = { + "id": "6846c784a46b5", + "repeat_against": "", + "noRetry": "true", + "getOnlyTokens": "undefined", + "log_id": "", + "app_name": "AF-Renderer::Self", + # unix_timestamp + "_": str(int(time.time() * 1000)), + "sid": sid, + } - data = {"bins": []} + r = s.post(API_URL, headers=headers, params=params) + r.raise_for_status() - # Get the table element and rows - table_element = soup.find("table", {"id": "tblCollectionDetails"}) - table_rows = table_element.find_all_next("tr") + data = r.json() + rows_data = data["integration"]["transformed"]["rows_data"]["0"] + if not isinstance(rows_data, dict): + raise ValueError("Invalid data returned from API") + token = rows_data["bartecToken"] - row_index = 0 - for row in table_rows: - if row_index < 1: - row_index += 1 - continue - else: - # Get the date from the th element - date = datetime.strptime( - row.find("th").get_text().strip(), "%A %d %B %Y" - ).strftime(date_format) + data = { + "formValues": { + "Location details": { + "propertyUPRN": { + "value": user_uprn, + }, + "postcode_search": { + "value": user_postcode, + }, + "bartecToken": { + "value": token, + }, + "dateMinimum": { + "value": datetime.now().strftime("%Y-%m-%d"), + }, + "dateMaximum": { + "value": (datetime.now() + timedelta(days=30)).strftime( + "%Y-%m-%d" + ), + }, + }, + }, + } - # Get the bin types from the td elements and filter out the empty ones - bin_types = filter(lambda td: td.find("img"), row.find_all("td")) + params = { + "id": "686e9147a867e", + "repeat_against": "", + "noRetry": "true", + "getOnlyTokens": "undefined", + "log_id": "", + "app_name": "AF-Renderer::Self", + # unix_timestamp + "_": str(int(time.time() * 1000)), + "sid": sid, + } - # Convert the bin types to a list - bin_types_list = list(bin_types) + r = s.post(API_URL, json=data, headers=headers, params=params) + r.raise_for_status() - # Append the bin type and date to the data dict - for td in bin_types_list: - img = 
td.find("img") - bin_type_text = img["alt"] - data["bins"].append({"type": bin_type_text, "collectionDate": date}) + data = r.json() + rows_data = data["integration"]["transformed"]["rows_data"] + if not isinstance(rows_data, dict): + raise ValueError("Invalid data returned from API") - row_index += 1 + for key, value in rows_data.items(): + dict_data = { + "type": value["bartecBinType"], + "collectionDate": datetime.strptime( + value["bartecBinStartDate"], "%Y-%m-%dT%H:%M:%S" + ).strftime(date_format), + } + bindata["bins"].append(dict_data) - return data + return bindata diff --git a/uk_bin_collection/uk_bin_collection/councils/SouthamptonCityCouncil.py b/uk_bin_collection/uk_bin_collection/councils/SouthamptonCityCouncil.py index ce4b8f148d..dba1c707e5 100644 --- a/uk_bin_collection/uk_bin_collection/councils/SouthamptonCityCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/SouthamptonCityCouncil.py @@ -22,9 +22,31 @@ def parse_data(self, page: str, **kwargs) -> dict: REGEX = r"(Glass|Recycling|General Waste|Garden Waste).*?([0-9]{1,2}\/[0-9]{1,2}\/[0-9]{4})" - s = requests.Session() - r = s.get( - f"https://www.southampton.gov.uk/whereilive/waste-calendar?UPRN={user_uprn}" + headers = { + "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + "accept-language": "en-GB,en;q=0.9", + "cache-control": "max-age=0", + "dnt": "1", + "priority": "u=0, i", + "referer": "https://www.southampton.gov.uk", + "sec-ch-ua": '"Google Chrome";v="141", "Not?A_Brand";v="8", "Chromium";v="141"', + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": '"Windows"', + "sec-fetch-dest": "document", + "sec-fetch-mode": "navigate", + "sec-fetch-site": "same-origin", + "upgrade-insecure-requests": "1", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36", + } + + params = { + "UPRN": user_uprn, + } + + r = 
requests.get( + "https://www.southampton.gov.uk/whereilive/waste-calendar", + params=params, + headers=headers, ) r.raise_for_status() diff --git a/uk_bin_collection/uk_bin_collection/councils/WokinghamBoroughCouncil.py b/uk_bin_collection/uk_bin_collection/councils/WokinghamBoroughCouncil.py index 7881e0ac48..a98916cafe 100644 --- a/uk_bin_collection/uk_bin_collection/councils/WokinghamBoroughCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/WokinghamBoroughCouncil.py @@ -1,3 +1,4 @@ +from bs4 import BeautifulSoup from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.support import expected_conditions as EC @@ -68,25 +69,39 @@ def parse_data(self, page: str, **kwargs) -> dict: # Wait for the collection dates elements to load collection_date_cards = WebDriverWait(driver, timeout).until( EC.presence_of_all_elements_located( - (By.XPATH, '//div[@class = "card__content"]') + (By.XPATH, '//div[@class = "card card--waste card--blue-light"]') ) ) - for collection_date_card in collection_date_cards: - waste_type = collection_date_card.find_element( - By.XPATH, './/h3[@class = "heading heading--sub heading--tiny"]' - ) - collection_date = collection_date_card.find_element( - By.XPATH, './/span[@class = "card__date"]' - ) - dt_collection_date = datetime.strptime( - collection_date.text.split(" ")[1], source_date_format + soup = BeautifulSoup(driver.page_source, features="html.parser") + + collection_cards = soup.find_all( + "div", {"class": "card card--waste card--blue-light"} + ) + + for collection_card in collection_cards: + collection_date_cards = collection_card.find_all( + "div", {"class": "card__content"} ) - dict_data = { - "type": waste_type.text, - "collectionDate": dt_collection_date.strftime(date_format), - } - data["bins"].append(dict_data) + + for collection_date_card in collection_date_cards: + + waste_type = collection_date_card.find( + "h3", {"class": "heading heading--sub 
heading--tiny"} + ) + + collection_date = collection_date_card.find( + "span", {"class": "card__date"} + ) + + dt_collection_date = datetime.strptime( + collection_date.text.strip().split(" ")[1], source_date_format + ) + dict_data = { + "type": waste_type.text.strip().split("(")[0].strip(), + "collectionDate": dt_collection_date.strftime(date_format), + } + data["bins"].append(dict_data) except Exception as e: # Here you can log the exception if needed print(f"An error occurred: {e}") diff --git a/wiki/Councils.md b/wiki/Councils.md index 0c8ddfd359..04f629d48d 100644 --- a/wiki/Councils.md +++ b/wiki/Councils.md @@ -93,6 +93,7 @@ This document is still a work in progress, don't worry if your council isn't lis - [Dorset Council](#dorset-council) - [Dover](#dover) - [Dudley](#dudley) +- [Dumfries and Galloway Council](#dumfries-and-galloway-council) - [Dundee City](#dundee-city) - [County Durham](#county-durham) - [Ealing](#ealing) @@ -785,7 +786,7 @@ Note: Pass the house number and postcode in their respective parameters. ### Brighton and Hove ```commandline -python collect_data.py BrightonandHoveCityCouncil https://cityclean.brighton-hove.gov.uk/link/collections -s -p "XXXX XXX" -n XX -w http://HOST:PORT/ +python collect_data.py BrightonandHoveCityCouncil https://enviroservices.brighton-hove.gov.uk/link/collections -s -p "XXXX XXX" -n XX -w http://HOST:PORT/ ``` Additional parameters: - `-s` - skip get URL @@ -1358,6 +1359,18 @@ Note: You will need to use [FindMyAddress](https://www.findmyaddress.co.uk/searc --- +### Dumfries and Galloway Council +```commandline +python collect_data.py DumfriesandGallowayCouncil https://www.dumfriesandgalloway.gov.uk -s -u XXXXXXXX +``` +Additional parameters: +- `-s` - skip get URL +- `-u` - UPRN + +Note: You will need to use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find the UPRN. 
+ +--- + ### Dundee City ```commandline python collect_data.py DundeeCityCouncil https://www.dundeecity.gov.uk/ -u XXXXXXXX @@ -2296,7 +2309,7 @@ Note: Pass the UPRN. You can find it using [FindMyAddress](https://www.findmyadd ### Hounslow ```commandline -python collect_data.py LondonBoroughHounslow https://www.hounslow.gov.uk/homepage/86/recycling_and_waste_collection_day_finder -s -u XXXXXXXX +python collect_data.py LondonBoroughHounslow https://my.hounslow.gov.uk/service/Waste_and_recycling_collections -s -u XXXXXXXX ``` Additional parameters: - `-s` - skip get URL @@ -2528,12 +2541,11 @@ Note: Pass the name of the street with the house number parameter, wrapped in do ### Middlesbrough ```commandline -python collect_data.py MiddlesbroughCouncil https://www.middlesbrough.gov.uk/recycling-and-rubbish/bin-collection-dates/ -s -n XX -w http://HOST:PORT/ +python collect_data.py MiddlesbroughCouncil https://www.middlesbrough.gov.uk/recycling-and-rubbish/bin-collection-dates/ -s -n XX ``` Additional parameters: - `-s` - skip get URL - `-n` - house number -- `-w` - remote Selenium web driver URL (required for Home Assistant) Note: Pass the entire address without postcode as it appears when you type it on the website. This parser requires a Selenium webdriver. @@ -2686,15 +2698,13 @@ Note: You will need to use [FindMyAddress](https://www.findmyaddress.co.uk/searc ### Newport ```commandline -python collect_data.py NewportCityCouncil https://www.newport.gov.uk/ -s -p "XXXX XXX" -n XX -w http://HOST:PORT/ +python collect_data.py NewportCityCouncil https://www.newport.gov.uk/ -s -u XXXXXXXX ``` Additional parameters: - `-s` - skip get URL -- `-p` - postcode -- `-n` - house number -- `-w` - remote Selenium web driver URL (required for Home Assistant) +- `-u` - UPRN -Note: Pass the postcode and house number in their respective arguments, both wrapped in quotes. +Note: You will need to use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find your UPRN. 
--- @@ -2885,7 +2895,7 @@ Note: Pass the UPRN. You can find it using [FindMyAddress](https://www.findmyadd ### Norwich ```commandline -python collect_data.py NorwichCityCouncil hhttps://bnr-wrp.whitespacews.com -p "XXXX XXX" -n XX +python collect_data.py NorwichCityCouncil https://bnr-wrp.whitespacews.com -p "XXXX XXX" -n XX ``` Additional parameters: - `-p` - postcode