From 937eb8ab2c3ab2cef730c194cb7af9e4c50495ed Mon Sep 17 00:00:00 2001 From: Ethan Troy <63926014+ethanolivertroy@users.noreply.github.com> Date: Thu, 14 May 2026 06:35:21 +0000 Subject: [PATCH] Bound certificate processing time --- README.md | 1 + scraper.py | 158 +++++++++++++++++++++++++++++++++++++++++++++++- test_scraper.py | 84 ++++++++++++++++++++++++- 3 files changed, 239 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 82ae48d80..320db69b2 100644 --- a/README.md +++ b/README.md @@ -214,6 +214,7 @@ python validate_api.py --require-current-schema --forbid-firecrawl-run-source | `CMVP_DB_PATH` | - | Path to cmvp.db for algorithm import (fastest override) | | `CERT_FETCH_CONCURRENCY` | `16` | Concurrent certificate detail page fetches | | `PDF_FETCH_CONCURRENCY` | `32` | Concurrent Security Policy PDF fetches/parses | +| `CERT_PROCESS_TIMEOUT` | `900` | Per-certificate processing timeout in seconds | | `FULL_REFRESH` | `0` | Set to `1` to bypass reuse of previously generated outputs | When Crawl4AI is unavailable or cannot parse a policy PDF, the scraper falls back to local Security Policy PDF parsing. 
# Per-certificate processing budget in seconds (environment override:
# CERT_PROCESS_TIMEOUT, default 900). Values below 1 are clamped to 1.
# The same change also registers a "certificate_timeouts" processing counter
# and reports this value in the extraction metrics under
# concurrency["certificate_process_timeout_seconds"].
CERT_PROCESS_TIMEOUT = max(1, int(os.getenv("CERT_PROCESS_TIMEOUT", "900")))


def build_certificate_timeout_result(
    module: Dict,
    dataset: str,
    generated_at: str,
    algorithm_source: str,
    previous_module: Optional[Dict],
    previous_detail: Optional[Dict],
    previous_metadata: Dict,
) -> Tuple[Dict, Optional[Dict], List[str], Dict[str, int]]:
    """Build a bounded fallback result when one certificate exceeds the timeout.

    The returned module record is ``previous_module`` overlaid with the fresh
    ``module`` row. What happens next depends on whether a previous detail
    payload can be reused:

    * A certificate number was parsed and ``previous_detail`` is non-empty:
      the detail payload is rebuilt with ``prepare_reused_detail_payload``,
      its non-empty ``MODULE_DETAIL_FIELDS`` are copied onto the module
      record, and any cached algorithm data is re-applied. The provenance is
      built with ``"cached"`` when cached algorithm data exists and ``"miss"``
      otherwise. ``detail_available`` is set to ``True``.
    * Otherwise: algorithm fields are stripped from the module record, the
      provenance is built with ``"miss"`` / ``"timeout"``, and
      ``detail_available`` is set to ``False``.

    In both cases the provenance carries a single attempt entry whose
    ``status`` is ``"timeout"``, and ``stats["certificate_timeouts"]`` is
    incremented.

    Args:
        module: Freshly scraped module row for the certificate.
        dataset: Dataset label forwarded to ``prepare_reused_detail_payload``.
        generated_at: Timestamp forwarded to ``prepare_reused_detail_payload``.
        algorithm_source: Configured algorithm source; ``"none"`` suppresses
            algorithm miss accounting.
        previous_module: Module record from the previous run, if any.
        previous_detail: Detail payload from the previous run, if any.
        previous_metadata: Previous run metadata, forwarded to
            ``cached_algorithm_extraction_source``.

    Returns:
        ``(module_out, detail_payload, categories, stats)``. On the
        no-reusable-detail path ``detail_payload`` is ``None`` and
        ``categories`` is an empty list.
    """
    stats = new_processing_stats()
    # NOTE(review): assumes new_processing_stats() seeds every counter used
    # below, including "certificate_timeouts" - confirm against its key list.
    stats["certificate_timeouts"] += 1

    cert_number = parse_certificate_number(module)
    module_out = dict(previous_module or {})
    module_out.update(module)

    source_url = module_out.get("security_policy_url")
    if cert_number is not None and not source_url:
        source_url = get_security_policy_url(cert_number)

    categories, detailed = cached_algorithm_fields(previous_module, previous_detail)
    has_cached_algorithms = bool(categories or detailed)
    attempt = {
        "source": algorithm_source,
        "url": str(source_url or ""),
        "status": "timeout",
    }

    detail_payload: Optional[Dict] = None
    if cert_number is not None and previous_detail:
        detail_payload = prepare_reused_detail_payload(
            previous_detail,
            module,
            cert_number,
            dataset,
            generated_at,
        )
        stats["html_reused"] += 1
        # Only non-empty detail values may overwrite what the module row has.
        for key in MODULE_DETAIL_FIELDS:
            value = detail_payload.get(key)
            if value not in (None, [], "", {}):
                module_out[key] = value
        module_out["security_policy_url"] = (
            detail_payload.get("security_policy_url")
            or module_out.get("security_policy_url")
        )
        cached_source, cached_source_url = cached_algorithm_extraction_source(
            previous_module,
            previous_detail,
            previous_metadata,
        )
        provenance = build_algorithm_extraction_provenance(
            algorithm_source,
            "cached" if has_cached_algorithms else "miss",
            cached_source if has_cached_algorithms else "timeout",
            cached_source_url or source_url,
            categories,
            detailed,
            cached=has_cached_algorithms,
            attempts=[attempt],
        )
        if has_cached_algorithms:
            stats["pdf_reused"] += 1
            stats["algorithm_cache_hits"] += 1
            stats["algorithm_successes"] += 1
        elif algorithm_source != "none":
            # Same guard as the no-detail path below: when algorithm
            # extraction is disabled there was nothing to miss.
            stats["algorithm_misses"] += 1
        apply_algorithm_fields(detail_payload, categories, detailed)
        # Only the detail payload keeps the per-attempt list.
        apply_algorithm_extraction_provenance(detail_payload, provenance, include_attempts=True)
        apply_algorithm_fields(module_out, categories, detailed)
        apply_algorithm_extraction_provenance(module_out, provenance)
        module_out["detail_available"] = True
        return module_out, detail_payload, categories, stats

    # No reusable detail: record the failure and drop algorithm fields.
    stats["html_failed"] += 1
    if algorithm_source in CACHEABLE_ALGORITHM_SOURCES:
        stats["pdf_failed"] += 1
    if algorithm_source != "none":
        stats["algorithm_misses"] += 1
    strip_algorithm_fields(module_out)
    provenance = build_algorithm_extraction_provenance(
        algorithm_source,
        "miss",
        "timeout",
        source_url,
        [],
        [],
        attempts=[attempt],
    )
    apply_algorithm_extraction_provenance(module_out, provenance)
    module_out["detail_available"] = False
    return module_out, None, [], stats


async def process_certificate_record_with_timeout(
    index: int,
    module: Dict,
    dataset: str,
    generated_at: str,
    algorithm_source: str,
    previous_module: Optional[Dict],
    previous_detail: Optional[Dict],
    previous_metadata: Dict,
    client: httpx.AsyncClient,
    cert_semaphore: asyncio.Semaphore,
    pdf_semaphore: asyncio.Semaphore,
    pdf_cache: Dict[str, asyncio.Task],
    pdf_cache_lock: asyncio.Lock,
    database_algorithms_map: Dict[int, List[str]],
) -> Tuple[int, Dict, Optional[Dict], List[str], Dict[str, int]]:
    """Process one certificate and return its input index, timing out slow records.

    Wraps ``process_certificate_record`` in ``asyncio.wait_for`` with a budget
    of ``CERT_PROCESS_TIMEOUT`` seconds. ``index`` is passed through untouched
    so a caller that consumes results out of order (for example via
    ``asyncio.as_completed``) can place each result in its original slot.

    On timeout a warning is written to stderr and the result is produced by
    ``build_certificate_timeout_result``. Any other exception raised by
    ``process_certificate_record`` propagates to the caller.

    Returns:
        ``(index, module_out, detail_payload, categories, stats)``.
    """
    # NOTE(review): both semaphores are handed to the wrapped coroutine, so
    # any time it spends queued on them presumably counts against the budget
    # too - confirm this is the intended meaning of "per-certificate".
    try:
        module_out, detail_payload, categories, stats = await asyncio.wait_for(
            process_certificate_record(
                module,
                dataset,
                generated_at,
                algorithm_source,
                previous_module,
                previous_detail,
                previous_metadata,
                client,
                cert_semaphore,
                pdf_semaphore,
                pdf_cache,
                pdf_cache_lock,
                database_algorithms_map,
            ),
            timeout=CERT_PROCESS_TIMEOUT,
        )
    except asyncio.TimeoutError:
        # wait_for has already cancelled the wrapped coroutine at this point.
        cert_number = parse_certificate_number(module)
        print(
            f"Warning: Timed out processing certificate {cert_number or 'unknown'} "
            f"after {CERT_PROCESS_TIMEOUT}s; preserving cached data when available.",
            file=sys.stderr,
        )
        module_out, detail_payload, categories, stats = build_certificate_timeout_result(
            module,
            dataset,
            generated_at,
            algorithm_source,
            previous_module,
            previous_detail,
            previous_metadata,
        )
    return index, module_out, detail_payload, categories, stats
build_certificate_artifacts( modules: List[Dict], dataset: str, @@ -1770,7 +1921,8 @@ async def build_certificate_artifacts( cert_number = parse_certificate_number(module) tasks.append( asyncio.create_task( - process_certificate_record( + process_certificate_record_with_timeout( + index, module, dataset, generated_at, @@ -1790,8 +1942,8 @@ async def build_certificate_artifacts( total = len(tasks) completed = 0 - for index, task in enumerate(tasks): - module_out, detail_payload, categories, task_stats = await task + for task in asyncio.as_completed(tasks): + index, module_out, detail_payload, categories, task_stats = await task completed += 1 results[index] = module_out cert_number = parse_certificate_number(module_out) diff --git a/test_scraper.py b/test_scraper.py index 0b3814571..23b010a34 100644 --- a/test_scraper.py +++ b/test_scraper.py @@ -10,6 +10,7 @@ import tempfile from pathlib import Path from types import SimpleNamespace +import scraper as scraper_module from scraper import ( ALGORITHM_CACHE_VERSION, ALGORITHM_EXTRACTION_SCHEMA_VERSION, @@ -744,14 +745,16 @@ def test_algorithm_extraction_provenance_and_metrics(): assert provenance["detailed_algorithm_count"] == 2, "Detailed algorithm count mismatch" assert len(provenance["attempts"]) == 2, "Attempt provenance should be retained for detail records" - active_stats = {"html_reused": 3, "algorithm_successes": 2, "algorithm_fallbacks": 1} + active_stats = {"html_reused": 3, "algorithm_successes": 2, "algorithm_fallbacks": 1, "certificate_timeouts": 1} historical_stats = {"html_refreshed": 4, "algorithm_misses": 1} metrics = build_extraction_metrics(active_stats, historical_stats) assert metrics["combined"]["html_reused"] == 3, "Combined metrics should include active counters" assert metrics["combined"]["html_refreshed"] == 4, "Combined metrics should include historical counters" assert metrics["combined"]["algorithm_successes"] == 2, "Combined metrics should include successes" assert 
def test_process_certificate_record_timeout_preserves_cached_data():
    """Timed-out certificate work should preserve cached detail and algorithm payloads.

    ``process_certificate_record`` is swapped for a coroutine that sleeps
    longer than the (temporarily shortened) ``CERT_PROCESS_TIMEOUT``, so the
    wrapper must take its timeout path and build the result from the cached
    ``previous_detail``. Both module globals are restored afterwards.
    """
    module = {
        "Certificate Number": "5238",
        "Vendor Name": "SUSE LLC",
        "Module Name": "SUSE Linux Enterprise OpenSSL 1 Cryptographic Module",
        "Module Type": "Software",
        "Validation Date": "04/10/2026",
        "security_policy_url": "https://csrc.nist.gov/CSRC/media/projects/cryptographic-module-validation-program/documents/security-policies/140sp5238.pdf",
        "certificate_detail_url": "https://csrc.nist.gov/projects/cryptographic-module-validation-program/certificate/5238",
    }
    previous_detail = {
        "certificate_number": "5238",
        "dataset": "active",
        "generated_at": "2026-04-01T00:00:00Z",
        "nist_page_url": module["certificate_detail_url"],
        "certificate_detail_url": module["certificate_detail_url"],
        "security_policy_url": module["security_policy_url"],
        "vendor_name": "SUSE LLC",
        "module_name": "SUSE Linux Enterprise OpenSSL 1 Cryptographic Module",
        "software_versions": "3.0.9",
        "hardware_versions": None,
        "firmware_versions": None,
        "algorithms": ["AES", "HMAC"],
        "algorithms_detailed": ["AES-CBC A1", "HMAC SHA2-256 A1"],
        "algorithm_extraction": {
            "source": "crawl4ai",
            "source_url": module["security_policy_url"],
        },
    }
    previous_metadata = {
        "algorithm_source": "crawl4ai",
        "algorithm_cache_version": ALGORITHM_CACHE_VERSION,
    }

    async def slow_process(*args, **kwargs):
        # Sleeps past the patched timeout; the raise is only reached if the
        # wrapper fails to cancel this coroutine.
        await asyncio.sleep(0.05)
        raise AssertionError("timeout wrapper should not wait for slow process to finish")

    async def run_wrapper():
        # Create the semaphores and lock while the loop is running. On
        # Python 3.9 and earlier they bind to the current event loop at
        # construction, which fails outside a loop once a previous
        # asyncio.run() has cleared it.
        return await scraper_module.process_certificate_record_with_timeout(
            7,
            module,
            "active",
            "2026-04-12T03:10:00.961597Z",
            "crawl4ai",
            module,
            previous_detail,
            previous_metadata,
            None,
            asyncio.Semaphore(1),
            asyncio.Semaphore(1),
            {},
            asyncio.Lock(),
            {},
        )

    original_process = scraper_module.process_certificate_record
    original_timeout = scraper_module.CERT_PROCESS_TIMEOUT
    scraper_module.process_certificate_record = slow_process
    scraper_module.CERT_PROCESS_TIMEOUT = 0.01
    try:
        index, module_out, detail_payload, categories, stats = asyncio.run(run_wrapper())
    finally:
        scraper_module.process_certificate_record = original_process
        scraper_module.CERT_PROCESS_TIMEOUT = original_timeout

    assert index == 7, "Timeout wrapper should preserve task index"
    assert categories == ["AES", "HMAC"], "Timeout fallback should preserve cached categories"
    assert module_out["detail_available"] is True, "Timeout fallback should preserve cached detail availability"
    assert module_out["algorithm_extraction"]["status"] == "cached", "Timeout fallback should mark cached algorithms"
    assert detail_payload["algorithm_extraction"]["attempts"][0]["status"] == "timeout", "Detail provenance should record timeout attempt"
    assert stats["certificate_timeouts"] == 1, "Timeout fallback should increment certificate_timeouts"
    assert stats["html_reused"] == 1, "Timeout fallback should reuse cached detail"
    assert stats["algorithm_cache_hits"] == 1, "Timeout fallback should count cached algorithms"

    print("✓ Certificate timeout fallback test passed")
@@ -1185,6 +1266,7 @@ def main(): test_algorithm_extraction_provenance_and_metrics() test_fetch_policy_pdf_bytes_reuses_in_run_cache() test_process_certificate_record_applies_cached_algorithm_provenance() + test_process_certificate_record_timeout_preserves_cached_data() test_prune_orphan_certificate_details() test_validate_generated_api_artifacts() test_build_certificate_index_payload()