Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import datetime
import time
import re
from datetime import datetime

from bs4 import BeautifulSoup
Expand All @@ -11,6 +11,16 @@
from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass


# Matches the council's date format e.g. "Wed 27 May 2026". Used to extract
# every date in a <p> tag — a single tag can contain multiple dates separated
# by commas when the council renders "Following Collections:" with two
# upcoming dates, and may be prefixed "Today - <date>".
_DATE_RE = re.compile(
r"\b(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun) \d{1,2} "
r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{4}\b"
)


# import the wonderful Beautiful Soup and the URL grabber
class CouncilClass(AbstractGetBinDataClass):
"""
Expand Down Expand Up @@ -41,15 +51,8 @@ def parse_data(self, page: str, **kwargs) -> dict:
driver = create_webdriver(web_driver, headless, None, __name__)
driver.get(url)

wait = WebDriverWait(driver, 30)
wait.until(
EC.presence_of_element_located(
(By.CSS_SELECTOR, '[aria-label="Postcode"]')
)
)

# Enter postcode
postcode_input = WebDriverWait(driver, 10).until(
postcode_input = WebDriverWait(driver, 30).until(
EC.presence_of_element_located(
(By.CSS_SELECTOR, '[aria-label="Postcode"]')
)
Expand All @@ -63,43 +66,82 @@ def parse_data(self, page: str, **kwargs) -> dict:
driver.execute_script("arguments[0].scrollIntoView();", find_address_button)
driver.execute_script("arguments[0].click();", find_address_button)

time.sleep(5)
# Wait for address dropdown
select_address_input = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "select"))
)

# Select address based on postcode and house number
# Wait for address dropdown to appear AND populate. Polling for a
# populated <select> beats a fixed sleep — the dropdown typically
# renders in <1s, so a 5s sleep just held Chrome idle and let the
# parallel Grid+local Chromes overlap memory peaks. On a busy
# Selenium Grid that overlap is what triggered the
# `__clone` / OOM crash inside the Grid container. Now we exit as
# soon as options are available.
def _populated_select(d):
selects = d.find_elements(By.CSS_SELECTOR, "select")
for s in selects:
if len(s.find_elements(By.TAG_NAME, "option")) > 1:
return s
return False

select_address_input = WebDriverWait(driver, 30).until(_populated_select)

# Select address based on postcode and house number. Iterate and
# prefer an exact-prefix match over substring matches so e.g.
# "ANNEXE 91 THE COMMON" doesn't beat "91 THE COMMON" when the
# caller asked for house number 91.
select = Select(select_address_input)
selected = False
postcode_upper = user_postcode.upper()
paon_str = str(user_paon).upper()

best_value = None
best_priority = 99
for addr_option in select.options:
if not addr_option.text or addr_option.text == "Please Select...":
if not addr_option.text or addr_option.text.strip() == "":
continue

option_text = addr_option.text.upper()
postcode_upper = user_postcode.upper()
paon_str = str(user_paon).upper()
if postcode_upper not in option_text:
continue

# Check if this option contains both postcode and house number
if postcode_upper in option_text and (
f"{paon_str} " in option_text
or f", {paon_str}," in option_text
if option_text.startswith(f"{paon_str} "):
priority = 0
elif (
f", {paon_str}," in option_text
or f", {paon_str} " in option_text
or f", {paon_str}A," in option_text
or option_text.endswith(f", {paon_str}")
):
select.select_by_value(addr_option.get_attribute("value"))
selected = True
break
priority = 1
elif f", {paon_str}A," in option_text:
priority = 2
elif f" {paon_str} " in option_text:
# Fallback substring match (e.g. "ANNEXE 91 THE COMMON").
priority = 3
else:
continue

if priority < best_priority:
best_priority = priority
best_value = addr_option.get_attribute("value")
if priority == 0:
break

if not selected:
if best_value is None:
raise ValueError(
f"Address not found for postcode {user_postcode} and house number {user_paon}"
)
select.select_by_value(best_value)

wait = WebDriverWait(driver, 30)
wait.until(EC.presence_of_element_located((By.ID, "collection-cards")))
WebDriverWait(driver, 30).until(
EC.presence_of_element_located((By.ID, "collection-cards"))
)
# Wait until at least one card has populated — the JS observer
# builds cards asynchronously after the select change, and reading
# page_source the instant #collection-cards appears can return an
# empty container. Bound on cards rather than a fixed sleep.
WebDriverWait(driver, 30).until(
lambda d: len(
d.find_elements(
By.CSS_SELECTOR, "#collection-cards .card h3"
)
)
> 0
)

# Parse the HTML content
soup = BeautifulSoup(driver.page_source, "html.parser")
Expand All @@ -113,24 +155,28 @@ def parse_data(self, page: str, **kwargs) -> dict:
p_tags = card.find_all("p") # any <p>

for p_tag in p_tags:
if p_tag.get_text().startswith("Frequency"):
text = p_tag.get_text()
if text.startswith("Frequency"):
continue

# Collect text in p excluding the strong tag
date_str = (p_tag.get_text()).split(":")[1]
if " - " in date_str:
date_str = date_str.split(" - ")[1]

collection_date = datetime.strptime(date_str, "%a %d %b %Y")

# print(collection_date.strftime(date_format)) # Tue 03 Feb 2026

# Create the dictionary with the formatted data
dict_data = {
"type": collection_type,
"collectionDate": collection_date.strftime(date_format),
}
data["bins"].append(dict_data)
# A single <p> can contain multiple dates — the
# "Following Collections:" tag renders comma-separated
# dates when the council has 3 upcoming collections,
# and "Next Collection:" can be prefixed with
# "Today - <date>". Pull every well-formed date out
# of the text and emit one entry per date.
for date_str in _DATE_RE.findall(text):
collection_date = datetime.strptime(
date_str, "%a %d %b %Y"
)
data["bins"].append(
{
"type": collection_type,
"collectionDate": collection_date.strftime(
date_format
),
}
)
except Exception as e:
# Here you can log the exception if needed
print(f"An error occurred: {e}")
Expand Down