Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions uk_bin_collection/tests/input.json
Original file line number Diff line number Diff line change
Expand Up @@ -913,7 +913,9 @@
"url": "https://www.exeter.gov.uk",
"wiki_name": "Exeter",
"wiki_note": "Pass the UPRN. You can find it using [FindMyAddress](https://www.findmyaddress.co.uk/search).",
"LAD24CD": "E07000041"
"LAD24CD": "E07000041",
"postcode": "EX2 4NT",
"house_number": "5"
},
"FalkirkCouncil": {
"uprn": "136065818",
Expand Down Expand Up @@ -1754,14 +1756,14 @@
"LAD24CD": "E06000012"
},
"NorthHertfordshireDistrictCouncil": {
"house_number": "Stewards Flat",
"postcode": "SG5 1PZ",
"skip_get_url": true,
"url": "https://waste.nc.north-herts.gov.uk/w/webpage/find-bin-collection-day-input-address",
"web_driver": "http://selenium:4444",
"wiki_name": "North Hertfordshire",
"wiki_note": "Pass a postcode (with space) and house_number/name. The scraper performs the Liberty Create typeahead lookup and matches house_number as a case-insensitive substring.",
"LAD24CD": "E07000099"
"house_number": "Stewards Flat",
"postcode": "SG5 1PZ",
"skip_get_url": true,
"url": "https://waste.nc.north-herts.gov.uk/w/webpage/find-bin-collection-day-input-address",
"web_driver": "http://selenium:4444",
"wiki_name": "North Hertfordshire",
"wiki_note": "Pass a postcode (with space) and house_number/name. The scraper performs the Liberty Create typeahead lookup and matches house_number as a case-insensitive substring.",
"LAD24CD": "E07000099"
},
"NorthKestevenDistrictCouncil": {
"skip_get_url": true,
Expand Down
59 changes: 47 additions & 12 deletions uk_bin_collection/uk_bin_collection/councils/ExeterCityCouncil.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import time
import re

import requests
from bs4 import BeautifulSoup
Expand All @@ -7,7 +7,6 @@
from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass


# import the wonderful Beautiful Soup and the URL grabber
class CouncilClass(AbstractGetBinDataClass):
"""
Concrete classes have to implement all abstract operations of the
Expand All @@ -16,25 +15,61 @@ class CouncilClass(AbstractGetBinDataClass):
"""

def parse_data(self, page: str, **kwargs) -> dict:

user_uprn = kwargs.get("uprn")
check_uprn(user_uprn)
user_postcode = kwargs.get("postcode")
user_paon = kwargs.get("paon")

bindata = {"bins": []}
results_html = None

# Prefer postcode+house_number lookup (works for all UPRNs)
if user_postcode and user_paon:
response = requests.get(
"https://exeter.gov.uk/repositories/hidden-pages/address-finder/",
params={"qsource": "POSTCODE", "qtype": "bins", "term": user_postcode},
timeout=30,
)
response.raise_for_status()
data = response.json()

if data:
# Extract leading number from paon for matching
paon_num = re.match(r"^(\d+)", str(user_paon).strip())
paon_prefix = paon_num.group(1) if paon_num else str(user_paon).strip()

for entry in data:
label = entry.get("label", "")
if label.strip().startswith(paon_prefix):
results_html = entry.get("Results")
break

URI = f"https://exeter.gov.uk/repositories/hidden-pages/address-finder/?qsource=UPRN&qtype=bins&term={user_uprn}"
# Fallback: first entry if no match
if not results_html and data:
results_html = data[0].get("Results")

response = requests.get(URI)
response.raise_for_status()
# Fall back to UPRN lookup (original method)
if not results_html and user_uprn:
check_uprn(user_uprn)
response = requests.get(
f"https://exeter.gov.uk/repositories/hidden-pages/address-finder/?qsource=UPRN&qtype=bins&term={user_uprn}",
timeout=30,
)
response.raise_for_status()
data = response.json()
if data:
results_html = data[0].get("Results")

data = response.json()
if not results_html:
return bindata
Comment on lines +37 to +63
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Don't turn postcode misses into another property's schedule.

startswith(paon_prefix) will match "5" against labels like "52 ...", and the data[0]/empty-result fallbacks then hide the miss instead of surfacing it. That can return a valid-looking schedule for the wrong address. Match the leading property token exactly, and if nothing matches, fall through to the UPRN branch or raise a clear exception rather than selecting the first result or returning empty data.

Suggested direction
-                paon_num = re.match(r"^(\d+)", str(user_paon).strip())
-                paon_prefix = paon_num.group(1) if paon_num else str(user_paon).strip()
+                normalized_paon = str(user_paon).strip()
+                paon_num = re.match(r"^(\d+)\b", normalized_paon)
+                paon_prefix = paon_num.group(1) if paon_num else normalized_paon.casefold()

                 for entry in data:
-                    label = entry.get("label", "")
-                    if label.strip().startswith(paon_prefix):
+                    label = entry.get("label", "").strip()
+                    label_num = re.match(r"^(\d+)\b", label)
+                    label_prefix = label_num.group(1) if label_num else label.casefold()
+                    if label_prefix == paon_prefix:
                         results_html = entry.get("Results")
                         break

-                # Fallback: first entry if no match
-                if not results_html and data:
-                    results_html = data[0].get("Results")
+                if not results_html and not user_uprn:
+                    raise ValueError(
+                        f"No Exeter address matched PAON '{user_paon}' within postcode '{user_postcode}'"
+                    )

-        if not results_html:
-            return bindata
+        if not results_html:
+            raise ValueError("Exeter lookup returned no schedule HTML")

Based on learnings: in uk_bin_collection/**/*.py, when parsing council bin collection data, prefer explicit failures (raise exceptions on unexpected formats) over silent defaults or swallowed errors.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@uk_bin_collection/uk_bin_collection/councils/ExeterCityCouncil.py` around
lines 37 - 63, The code currently uses paon_num/paon_prefix and
label.strip().startswith(paon_prefix) which can erroneously match "5" to "52";
change the matching logic in the loop over data to extract the leading token
from label and compare it exactly (e.g. match the first token with a
word-boundary or exact token equality against paon_prefix) instead of using
startswith, and remove the silent fallback that assigns results_html =
data[0].get("Results"); if no exact match is found leave results_html unset so
the existing UPRN branch (check_uprn and the requests call) runs, and if both
PAON and UPRN branches fail raise a clear exception (or return an explicit
error) rather than returning bindata or silently returning the first result.


soup = BeautifulSoup(data[0]["Results"], "html.parser")
soup.prettify()
soup = BeautifulSoup(results_html, "html.parser")

# Extract bin schedule
for section in soup.find_all("h2"):
bin_type = section.text.strip()
collection_date = section.find_next("h3").text.strip()
h3 = section.find_next("h3")
if not h3:
continue
Comment on lines +69 to +71
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Fail loudly when a bin section has no date.

Skipping the section here silently drops a collection from the response. If Exeter changes the markup, this returns partial data instead of showing that the parser is broken.

Suggested fix
             h3 = section.find_next("h3")
             if not h3:
-                continue
+                raise ValueError(
+                    f"Missing collection date for Exeter bin section '{bin_type}'"
+                )
             collection_date = h3.text.strip()

Based on learnings: in uk_bin_collection/**/*.py, when parsing council bin collection data, prefer explicit failures (raise exceptions on unexpected formats) over silent defaults or swallowed errors.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@uk_bin_collection/uk_bin_collection/councils/ExeterCityCouncil.py` around
lines 69 - 71, The code silently skips a bin section when h3 =
section.find_next("h3") returns None, which can hide parsing breakages; in the
ExeterCityCouncil parser (the loop that inspects each section in class
ExeterCityCouncil / its parse method), replace the silent continue with an
explicit raised exception (e.g., ValueError or a CouncilParsingError) that
includes contextual info (the section HTML or an identifying message) so the
caller fails loudly when a collection date/title is missing; ensure the
exception references the failing symbol (section.find_next("h3")) and provides
clear context for debugging.

collection_date = h3.text.strip()

dict_data = {
"type": bin_type,
Expand Down
Loading