Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ python validate_api.py --require-current-schema --forbid-firecrawl-run-source
| `CMVP_DB_PATH` | - | Path to cmvp.db for algorithm import (fastest override) |
| `CERT_FETCH_CONCURRENCY` | `16` | Concurrent certificate detail page fetches |
| `PDF_FETCH_CONCURRENCY` | `32` | Concurrent Security Policy PDF fetches/parses |
| `CERT_PROCESS_TIMEOUT` | `900` | Per-certificate processing timeout in seconds |
| `FULL_REFRESH` | `0` | Set to `1` to bypass reuse of previously generated outputs |

When Crawl4AI is unavailable or cannot parse a policy PDF, the scraper falls back to local Security Policy PDF parsing.
Expand Down
158 changes: 155 additions & 3 deletions scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
If set, algorithm data will be imported from this database
CERT_FETCH_CONCURRENCY: Concurrent certificate detail page fetches (default: 16)
PDF_FETCH_CONCURRENCY: Concurrent Security Policy PDF fetches/parses (default: 32)
CERT_PROCESS_TIMEOUT: Per-certificate processing timeout in seconds (default: 900)
FULL_REFRESH: Set to "1" to bypass reuse of previously generated outputs
"""

Expand Down Expand Up @@ -63,6 +64,7 @@
SKIP_ALGORITHMS = os.getenv("SKIP_ALGORITHMS", "0") == "1"
CERT_FETCH_CONCURRENCY = max(1, int(os.getenv("CERT_FETCH_CONCURRENCY", "16")))
PDF_FETCH_CONCURRENCY = max(1, int(os.getenv("PDF_FETCH_CONCURRENCY", "32")))
CERT_PROCESS_TIMEOUT = max(1, int(os.getenv("CERT_PROCESS_TIMEOUT", "900")))
FULL_REFRESH = os.getenv("FULL_REFRESH", "0") == "1"

# Path to NIST-CMVP-ReportGen database (if available for importing algorithms)
Expand Down Expand Up @@ -267,6 +269,7 @@
"algorithm_source_security_policy_pdf",
"algorithm_source_database",
"algorithm_source_none",
"certificate_timeouts",
)


Expand Down Expand Up @@ -393,6 +396,7 @@ def build_extraction_metrics(active_stats: Dict[str, int], historical_stats: Dic
"concurrency": {
"certificate_fetch": CERT_FETCH_CONCURRENCY,
"security_policy_fetch": PDF_FETCH_CONCURRENCY,
"certificate_process_timeout_seconds": CERT_PROCESS_TIMEOUT,
},
}

Expand Down Expand Up @@ -1735,6 +1739,153 @@ async def process_certificate_record(
return module_out, detail_payload, module_categories, stats


def build_certificate_timeout_result(
    module: Dict,
    dataset: str,
    generated_at: str,
    algorithm_source: str,
    previous_module: Optional[Dict],
    previous_detail: Optional[Dict],
    previous_metadata: Dict,
) -> Tuple[Dict, Optional[Dict], List[str], Dict[str, int]]:
    """Build a bounded fallback result when one certificate exceeds the timeout.

    Reuses previously generated outputs where available so a slow certificate
    degrades gracefully instead of losing data: the cached detail payload is
    reused when present, and cached algorithm fields are preserved even when
    only the cached module row (from modules.json) is available.

    Returns:
        Tuple of (module row, optional detail payload, algorithm categories,
        per-certificate processing stats).
    """
    stats = new_processing_stats()
    stats["certificate_timeouts"] += 1

    cert_number = parse_certificate_number(module)
    module_out = dict(previous_module or {})
    module_out.update(module)

    source_url = module_out.get("security_policy_url")
    if cert_number is not None and not source_url:
        source_url = get_security_policy_url(cert_number)

    # Honor an explicit ALGORITHM_SOURCE=none: never repopulate algorithms
    # from cache when extraction has been deliberately disabled, so timed-out
    # records stay consistent with the rest of the run.
    if algorithm_source == "none":
        categories, detailed = [], []
    else:
        categories, detailed = cached_algorithm_fields(previous_module, previous_detail)
    have_cached_algorithms = bool(categories or detailed)
    attempt = {
        "source": algorithm_source,
        "url": str(source_url or ""),
        "status": "timeout",
    }

    detail_payload: Optional[Dict] = None
    if cert_number is not None and previous_detail:
        detail_payload = prepare_reused_detail_payload(
            previous_detail,
            module,
            cert_number,
            dataset,
            generated_at,
        )
        stats["html_reused"] += 1
        for key in MODULE_DETAIL_FIELDS:
            value = detail_payload.get(key)
            if value not in (None, [], "", {}):
                module_out[key] = value
        module_out["security_policy_url"] = detail_payload.get("security_policy_url") or module_out.get("security_policy_url")
        cached_source, cached_source_url = cached_algorithm_extraction_source(
            previous_module,
            previous_detail,
            previous_metadata,
        )
        # NOTE(review): when algorithm_source == "none" this records a "miss";
        # confirm whether the project uses a dedicated "skipped" status here.
        provenance = build_algorithm_extraction_provenance(
            algorithm_source,
            "cached" if have_cached_algorithms else "miss",
            cached_source if have_cached_algorithms else "timeout",
            cached_source_url or source_url,
            categories,
            detailed,
            cached=have_cached_algorithms,
            attempts=[attempt],
        )
        if have_cached_algorithms:
            stats["pdf_reused"] += 1
            stats["algorithm_cache_hits"] += 1
            stats["algorithm_successes"] += 1
        elif algorithm_source != "none":
            # A disabled source is a deliberate skip, not a miss.
            stats["algorithm_misses"] += 1
        apply_algorithm_fields(detail_payload, categories, detailed)
        apply_algorithm_extraction_provenance(detail_payload, provenance, include_attempts=True)
        apply_algorithm_fields(module_out, categories, detailed)
        apply_algorithm_extraction_provenance(module_out, provenance)
        module_out["detail_available"] = True
        return module_out, detail_payload, categories, stats

    stats["html_failed"] += 1

    if have_cached_algorithms:
        # The detail JSON is missing, but the cached module row (loaded from
        # api/modules.json) still carries algorithms -- preserve them rather
        # than discarding previously extracted data on timeout.
        cached_source, cached_source_url = cached_algorithm_extraction_source(
            previous_module,
            previous_detail,
            previous_metadata,
        )
        provenance = build_algorithm_extraction_provenance(
            algorithm_source,
            "cached",
            cached_source,
            cached_source_url or source_url,
            categories,
            detailed,
            cached=True,
            attempts=[attempt],
        )
        stats["pdf_reused"] += 1
        stats["algorithm_cache_hits"] += 1
        stats["algorithm_successes"] += 1
        apply_algorithm_fields(module_out, categories, detailed)
        apply_algorithm_extraction_provenance(module_out, provenance)
        # No cached detail payload file exists, so the detail stays unavailable.
        module_out["detail_available"] = False
        return module_out, None, categories, stats

    if algorithm_source in CACHEABLE_ALGORITHM_SOURCES:
        stats["pdf_failed"] += 1
    if algorithm_source != "none":
        stats["algorithm_misses"] += 1
    strip_algorithm_fields(module_out)
    provenance = build_algorithm_extraction_provenance(
        algorithm_source,
        "miss",
        "timeout",
        source_url,
        [],
        [],
        attempts=[attempt],
    )
    apply_algorithm_extraction_provenance(module_out, provenance)
    module_out["detail_available"] = False
    return module_out, None, [], stats
Comment on lines +1763 to +1830
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Action required

1. Cached algorithms dropped 🐞 Bug ≡ Correctness

build_certificate_timeout_result() strips algorithm fields and returns empty categories when
previous_detail is missing, even if previous_module contains cached algorithms. This causes
timed-out certificates to lose previously extracted algorithm data loaded from modules.json.
Agent Prompt
## Issue description
In `build_certificate_timeout_result()`, cached algorithm fields are computed from `(previous_module, previous_detail)` but are only applied/returned when `previous_detail` exists. If the detail JSON is missing but the cached module row contains algorithms, the timeout fallback currently calls `strip_algorithm_fields(module_out)` and returns `[]`, discarding available cached algorithms.

## Issue Context
`load_previous_outputs()` loads cached module rows from `api/modules.json` and certificate detail payloads separately from `DETAIL_DIR`. It is therefore valid for `previous_module` to exist while `previous_detail` is absent; the timeout fallback should still preserve cached algorithm fields from `previous_module` in that case.

## Fix Focus Areas
- scraper.py[830-860]
- scraper.py[1742-1830]

### Suggested implementation direction
- On the `previous_detail is None` timeout path, if `categories`/`detailed` from `cached_algorithm_fields(previous_module, None)` are non-empty:
  - apply them to `module_out` via `apply_algorithm_fields(...)`
  - build provenance with a status like `cached` (or a clear timeout-specific cached status) and include the timeout attempt
  - increment the same cache-hit counters used in the `previous_detail` branch (`pdf_reused`, `algorithm_cache_hits`, `algorithm_successes`) as appropriate
  - return those categories instead of `[]`
- Keep `detail_available` as `False` and `detail_payload` as `None` when there is no cached detail payload file.

ⓘ Copy this prompt and use it to remediate the issue with your preferred AI generation tools



async def process_certificate_record_with_timeout(
    index: int,
    module: Dict,
    dataset: str,
    generated_at: str,
    algorithm_source: str,
    previous_module: Optional[Dict],
    previous_detail: Optional[Dict],
    previous_metadata: Dict,
    client: httpx.AsyncClient,
    cert_semaphore: asyncio.Semaphore,
    pdf_semaphore: asyncio.Semaphore,
    pdf_cache: Dict[str, asyncio.Task],
    pdf_cache_lock: asyncio.Lock,
    database_algorithms_map: Dict[int, List[str]],
) -> Tuple[int, Dict, Optional[Dict], List[str], Dict[str, int]]:
    """Process one certificate and return its input index, timing out slow records.

    On timeout -- or when a cancellation leaks out of a shared pdf_cache task
    that another worker's timeout cancelled -- the record degrades to the
    cached-data fallback instead of failing the whole run.
    """

    def _timeout_fallback() -> Tuple[int, Dict, Optional[Dict], List[str], Dict[str, int]]:
        # Emit the warning and build the bounded fallback from cached outputs.
        cert_number = parse_certificate_number(module)
        print(
            f"Warning: Timed out processing certificate {cert_number or 'unknown'} "
            f"after {CERT_PROCESS_TIMEOUT}s; preserving cached data when available.",
            file=sys.stderr,
        )
        module_out, detail_payload, categories, stats = build_certificate_timeout_result(
            module,
            dataset,
            generated_at,
            algorithm_source,
            previous_module,
            previous_detail,
            previous_metadata,
        )
        return index, module_out, detail_payload, categories, stats

    try:
        module_out, detail_payload, categories, stats = await asyncio.wait_for(
            process_certificate_record(
                module,
                dataset,
                generated_at,
                algorithm_source,
                previous_module,
                previous_detail,
                previous_metadata,
                client,
                cert_semaphore,
                pdf_semaphore,
                pdf_cache,
                pdf_cache_lock,
                database_algorithms_map,
            ),
            timeout=CERT_PROCESS_TIMEOUT,
        )
        return index, module_out, detail_payload, categories, stats
    except asyncio.TimeoutError:
        return _timeout_fallback()
    except asyncio.CancelledError:
        # wait_for cancels process_certificate_record on timeout; if that
        # coroutine was awaiting a shared pdf_cache task, the cancellation can
        # propagate to other workers awaiting the same cached task as a
        # CancelledError rather than a TimeoutError. Swallow it ONLY when this
        # task was not itself cancelled (detectable on Python 3.11+ via
        # Task.cancelling()); otherwise re-raise so genuine shutdown
        # cancellation still propagates. On older Pythons the getattr guard
        # keeps the original re-raise behavior.
        current = asyncio.current_task()
        cancelling = getattr(current, "cancelling", None)
        if cancelling is not None and cancelling() == 0:
            return _timeout_fallback()
        raise


async def build_certificate_artifacts(
modules: List[Dict],
dataset: str,
Expand Down Expand Up @@ -1770,7 +1921,8 @@ async def build_certificate_artifacts(
cert_number = parse_certificate_number(module)
tasks.append(
asyncio.create_task(
process_certificate_record(
process_certificate_record_with_timeout(
index,
module,
dataset,
generated_at,
Expand All @@ -1790,8 +1942,8 @@ async def build_certificate_artifacts(

total = len(tasks)
completed = 0
for index, task in enumerate(tasks):
module_out, detail_payload, categories, task_stats = await task
for task in asyncio.as_completed(tasks):
index, module_out, detail_payload, categories, task_stats = await task
completed += 1
results[index] = module_out
cert_number = parse_certificate_number(module_out)
Expand Down
84 changes: 83 additions & 1 deletion test_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import tempfile
from pathlib import Path
from types import SimpleNamespace
import scraper as scraper_module
from scraper import (
ALGORITHM_CACHE_VERSION,
ALGORITHM_EXTRACTION_SCHEMA_VERSION,
Expand Down Expand Up @@ -744,14 +745,16 @@ def test_algorithm_extraction_provenance_and_metrics():
assert provenance["detailed_algorithm_count"] == 2, "Detailed algorithm count mismatch"
assert len(provenance["attempts"]) == 2, "Attempt provenance should be retained for detail records"

active_stats = {"html_reused": 3, "algorithm_successes": 2, "algorithm_fallbacks": 1}
active_stats = {"html_reused": 3, "algorithm_successes": 2, "algorithm_fallbacks": 1, "certificate_timeouts": 1}
historical_stats = {"html_refreshed": 4, "algorithm_misses": 1}
metrics = build_extraction_metrics(active_stats, historical_stats)
assert metrics["combined"]["html_reused"] == 3, "Combined metrics should include active counters"
assert metrics["combined"]["html_refreshed"] == 4, "Combined metrics should include historical counters"
assert metrics["combined"]["algorithm_successes"] == 2, "Combined metrics should include successes"
assert metrics["combined"]["algorithm_misses"] == 1, "Combined metrics should include misses"
assert metrics["combined"]["certificate_timeouts"] == 1, "Combined metrics should include certificate timeouts"
assert "concurrency" in metrics, "Extraction metrics should record concurrency settings"
assert "certificate_process_timeout_seconds" in metrics["concurrency"], "Extraction metrics should record certificate timeout"

print("✓ Algorithm provenance and metrics test passed")

Expand Down Expand Up @@ -863,6 +866,84 @@ def test_process_certificate_record_applies_cached_algorithm_provenance():
print("✓ Cached algorithm provenance application test passed")


def test_process_certificate_record_timeout_preserves_cached_data():
    """Timed-out certificate work should preserve cached detail and algorithm payloads."""
    policy_url = "https://csrc.nist.gov/CSRC/media/projects/cryptographic-module-validation-program/documents/security-policies/140sp5238.pdf"
    detail_url = "https://csrc.nist.gov/projects/cryptographic-module-validation-program/certificate/5238"
    record = {
        "Certificate Number": "5238",
        "Vendor Name": "SUSE LLC",
        "Module Name": "SUSE Linux Enterprise OpenSSL 1 Cryptographic Module",
        "Module Type": "Software",
        "Validation Date": "04/10/2026",
        "security_policy_url": policy_url,
        "certificate_detail_url": detail_url,
    }
    cached_detail = {
        "certificate_number": "5238",
        "dataset": "active",
        "generated_at": "2026-04-01T00:00:00Z",
        "nist_page_url": detail_url,
        "certificate_detail_url": detail_url,
        "security_policy_url": policy_url,
        "vendor_name": "SUSE LLC",
        "module_name": "SUSE Linux Enterprise OpenSSL 1 Cryptographic Module",
        "software_versions": "3.0.9",
        "hardware_versions": None,
        "firmware_versions": None,
        "algorithms": ["AES", "HMAC"],
        "algorithms_detailed": ["AES-CBC A1", "HMAC SHA2-256 A1"],
        "algorithm_extraction": {
            "source": "crawl4ai",
            "source_url": policy_url,
        },
    }
    cached_metadata = {
        "algorithm_source": "crawl4ai",
        "algorithm_cache_version": ALGORITHM_CACHE_VERSION,
    }

    async def never_finishes(*args, **kwargs):
        # Simulates a record that outlives the configured timeout.
        await asyncio.sleep(0.05)
        raise AssertionError("timeout wrapper should not wait for slow process to finish")

    saved_process = scraper_module.process_certificate_record
    saved_timeout = scraper_module.CERT_PROCESS_TIMEOUT
    scraper_module.process_certificate_record = never_finishes
    scraper_module.CERT_PROCESS_TIMEOUT = 0.01
    try:
        result = asyncio.run(
            scraper_module.process_certificate_record_with_timeout(
                7,
                record,
                "active",
                "2026-04-12T03:10:00.961597Z",
                "crawl4ai",
                record,
                cached_detail,
                cached_metadata,
                None,
                asyncio.Semaphore(1),
                asyncio.Semaphore(1),
                {},
                asyncio.Lock(),
                {},
            )
        )
    finally:
        scraper_module.process_certificate_record = saved_process
        scraper_module.CERT_PROCESS_TIMEOUT = saved_timeout

    index, module_out, detail_payload, categories, stats = result
    assert index == 7, "Timeout wrapper should preserve task index"
    assert categories == ["AES", "HMAC"], "Timeout fallback should preserve cached categories"
    assert module_out["detail_available"] is True, "Timeout fallback should preserve cached detail availability"
    assert module_out["algorithm_extraction"]["status"] == "cached", "Timeout fallback should mark cached algorithms"
    assert detail_payload["algorithm_extraction"]["attempts"][0]["status"] == "timeout", "Detail provenance should record timeout attempt"
    assert stats["certificate_timeouts"] == 1, "Timeout fallback should increment certificate_timeouts"
    assert stats["html_reused"] == 1, "Timeout fallback should reuse cached detail"
    assert stats["algorithm_cache_hits"] == 1, "Timeout fallback should count cached algorithms"

    print("✓ Certificate timeout fallback test passed")


def test_prune_orphan_certificate_details():
"""Test that stale certificate detail files are removed only for missing certs."""
with tempfile.TemporaryDirectory() as temp_dir:
Expand Down Expand Up @@ -1185,6 +1266,7 @@ def main():
test_algorithm_extraction_provenance_and_metrics()
test_fetch_policy_pdf_bytes_reuses_in_run_cache()
test_process_certificate_record_applies_cached_algorithm_provenance()
test_process_certificate_record_timeout_preserves_cached_data()
test_prune_orphan_certificate_details()
test_validate_generated_api_artifacts()
test_build_certificate_index_payload()
Expand Down