48 changes: 24 additions & 24 deletions uk_bin_collection/uk_bin_collection/councils/BexleyCouncil.py
@@ -17,26 +17,32 @@ class CouncilClass(AbstractGetBinDataClass):

     def parse_data(self, page: str, **kwargs) -> dict:
         user_uprn = kwargs.get("uprn")
         check_uprn(user_uprn)
 
         page = f"https://waste.bexley.gov.uk/waste/{user_uprn}"
 
         headers = {
             "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"
         }
 
-        # First request may trigger async page generation; retry if content not ready
-        found = False
-        for attempt in range(3):
-            response = requests.get(page, headers=headers, timeout=30)
+        session = requests.Session()
+        session.headers.update(headers)
+
+        # Initial request triggers server-side schedule computation and sets session cookie
+        session.get(page, timeout=60)
+
+        # Poll for results — server computes async, JS client polls ?page_loading=1
+        # with x-requested-with header to get the content fragment
+        response = None
+        for attempt in range(30):
+            response = session.get(
+                f"{page}?page_loading=1",
+                headers={"x-requested-with": "fetch"},
+                timeout=60,
+            )
             response.raise_for_status()
             if "waste-service-name" in response.text:
-                found = True
                 break
-            time.sleep(3)
-
-        if not found:
-            raise ValueError("Bexley WasteWorks page did not return expected content after 3 attempts")
+            time.sleep(2)
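
For orientation, here is the new flow in isolation: a minimal sketch, assuming the same waste.bexley.gov.uk endpoint and "waste-service-name" marker as the diff above. The function name and the trailing TimeoutError are illustrative additions, not part of the module.

import time

import requests

def fetch_schedule_html(uprn: str) -> str:
    page = f"https://waste.bexley.gov.uk/waste/{uprn}"
    session = requests.Session()
    session.headers.update({"User-Agent": "Mozilla/5.0"})

    # Bootstrap request: sets the session cookie and kicks off the
    # server-side schedule computation.
    session.get(page, timeout=60)

    # Poll the fragment endpoint the JS client uses until the rendered
    # service markup appears.
    for _ in range(30):
        response = session.get(
            f"{page}?page_loading=1",
            headers={"x-requested-with": "fetch"},
            timeout=60,
        )
        response.raise_for_status()
        if "waste-service-name" in response.text:
            return response.text
        time.sleep(2)
    raise TimeoutError("schedule fragment never became ready")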
Comment on lines +30 to +45
coderabbitai[bot] (Contributor) commented:
⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Enforce a real 60-second overall deadline.

timeout=60 is per request, so this can block for far longer than 60 seconds once you include the bootstrap request, 30 polls, and sleeps. Track a deadline and cap each request by the remaining budget.

Proposed fix
-        # Initial request triggers server-side schedule computation and sets session cookie
-        session.get(page, timeout=60)
+        deadline = time.monotonic() + 60
+
+        # Initial request triggers server-side schedule computation and sets session cookie
+        bootstrap = session.get(page, timeout=10)
+        bootstrap.raise_for_status()

         # Poll for results — server computes async, JS client polls ?page_loading=1
         # with x-requested-with header to get the content fragment
         response = None
-        for attempt in range(30):
+        for _ in range(30):
+            remaining = deadline - time.monotonic()
+            if remaining <= 0:
+                raise TimeoutError("Timed out waiting for Bexley waste data to load")
             response = session.get(
                 f"{page}?page_loading=1",
                 headers={"x-requested-with": "fetch"},
-                timeout=60,
+                timeout=min(10, remaining),
             )
             response.raise_for_status()
             if "waste-service-name" in response.text:
                 break
-            time.sleep(2)
+            time.sleep(min(2, max(0, deadline - time.monotonic())))
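
The suggestion generalises beyond this scraper: budget the whole operation once with time.monotonic() (which is immune to wall-clock adjustments), then derive every per-request timeout and sleep from what remains. A small illustrative helper, with a name of our choosing:

import time

def remaining_budget(deadline: float, per_request_cap: float = 10.0) -> float:
    """Seconds left until `deadline` (a time.monotonic() value), capped
    per request; raises once the overall budget is exhausted."""
    remaining = deadline - time.monotonic()
    if remaining <= 0:
        raise TimeoutError("overall deadline exceeded")
    return min(per_request_cap, remaining)

# Usage:
#   deadline = time.monotonic() + 60
#   response = session.get(url, timeout=remaining_budget(deadline))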
🧰 Tools
🪛 Ruff (0.15.12)

[warning] 36-36: Loop control variable `attempt` not used within loop body; rename unused `attempt` to `_attempt` (B007)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@uk_bin_collection/uk_bin_collection/councils/BexleyCouncil.py` around lines
30-45: the current polling uses timeout=60 per request, which can exceed a 60s
overall deadline once the initial session.get, up to 30 polls, and the sleeps
are combined. Track a single deadline (use time.monotonic()) set to now + 60s
before the initial request; before each network call compute remaining =
deadline - time.monotonic() and pass timeout=min(remaining, 60) to session.get
(for both the initial bootstrap request and the "{page}?page_loading=1" polls);
abort with a timeout error if remaining <= 0; and cap the time.sleep() between
attempts so it never sleeps past the remaining budget. The identifiers involved
are the existing session.get calls and the polling loop around
response = session.get(f"{page}?page_loading=1", ...).

coderabbitai[bot] marked this conversation as resolved.

         soup = BeautifulSoup(response.text, features="html.parser")

@@ -49,12 +55,8 @@ def parse_data(self, page: str, **kwargs) -> dict:
             if not h3:
                 continue
 
-            # Get the main service name (first line of h3, before subtitle)
-            service_name_elem = h3.find("span") or h3
-            # Extract just the first text node for the bin type
             bin_type = h3.get_text(separator="\n", strip=True).split("\n")[0]
 
-            # Find 'Next collection' in the summary list
             summary_list = grid.find("dl", class_="govuk-summary-list")
             if not summary_list:
                 continue
@@ -69,13 +71,14 @@ def parse_data(self, page: str, **kwargs) -> dict:

                 next_collection = dd.get_text(strip=True)
                 try:
-                    # Parse date like "Wednesday 8 April 2026" or "Tuesday, 8th April 2026"
-                    cleaned = remove_ordinal_indicator_from_date_string(next_collection)
-                    # Try with comma format first
-                    try:
-                        parsed_date = datetime.strptime(cleaned, "%A, %d %B %Y")
-                    except ValueError:
-                        parsed_date = datetime.strptime(cleaned, "%A %d %B %Y")
+                    if "collected today" in next_collection.lower() or "could not be collected today" in next_collection.lower():
+                        parsed_date = datetime.now()
+                    else:
+                        cleaned = remove_ordinal_indicator_from_date_string(next_collection)
+                        try:
+                            parsed_date = datetime.strptime(cleaned, "%A, %d %B %Y")
+                        except ValueError:
+                            parsed_date = datetime.strptime(cleaned, "%A %d %B %Y")
 
                     data["bins"].append(
                         {
@@ -87,7 +90,4 @@ def parse_data(self, page: str, **kwargs) -> dict:
                     print(f"Error parsing date for {bin_type}: {e}")
                     break
 
-        if not data["bins"]:
-            raise ValueError("No collection dates found — page structure may have changed")
-
         return data
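
The date handling in the final hunk is easiest to follow in isolation. A hedged sketch: strip_ordinals stands in for the project's remove_ordinal_indicator_from_date_string helper, and note that the first "collected today" test already matches "could not be collected today" as a substring, so the second clause in the diff is defensive rather than necessary.

import re
from datetime import datetime

def strip_ordinals(text: str) -> str:
    # "8th April" -> "8 April"
    return re.sub(r"(\d)(st|nd|rd|th)\b", r"\1", text)

def parse_next_collection(text: str) -> datetime:
    if "collected today" in text.lower():
        # Covers "collected today" and "could not be collected today".
        return datetime.now()
    cleaned = strip_ordinals(text)
    for fmt in ("%A, %d %B %Y", "%A %d %B %Y"):  # with and without comma
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
    raise ValueError(f"unrecognised date format: {text!r}")

# parse_next_collection("Tuesday, 8th April 2026") -> datetime(2026, 4, 8, 0, 0)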