Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ python validate_api.py --require-current-schema --forbid-firecrawl-run-source
| `CMVP_DB_PATH` | - | Path to cmvp.db for algorithm import (fastest override) |
| `CERT_FETCH_CONCURRENCY` | `16` | Concurrent certificate detail page fetches |
| `PDF_FETCH_CONCURRENCY` | `32` | Concurrent Security Policy PDF fetches/parses |
| `CERT_PROCESS_TIMEOUT` | `900` | Per-certificate processing timeout in seconds |
| `FULL_REFRESH` | `0` | Set to `1` to bypass reuse of previously generated outputs |

When Crawl4AI is unavailable or cannot parse a policy PDF, the scraper falls back to local Security Policy PDF parsing.
Expand Down
158 changes: 155 additions & 3 deletions scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
If set, algorithm data will be imported from this database
CERT_FETCH_CONCURRENCY: Concurrent certificate detail page fetches (default: 16)
PDF_FETCH_CONCURRENCY: Concurrent Security Policy PDF fetches/parses (default: 32)
CERT_PROCESS_TIMEOUT: Per-certificate processing timeout in seconds (default: 900)
FULL_REFRESH: Set to "1" to bypass reuse of previously generated outputs
"""

Expand Down Expand Up @@ -63,6 +64,7 @@
SKIP_ALGORITHMS = os.getenv("SKIP_ALGORITHMS", "0") == "1"
CERT_FETCH_CONCURRENCY = max(1, int(os.getenv("CERT_FETCH_CONCURRENCY", "16")))
PDF_FETCH_CONCURRENCY = max(1, int(os.getenv("PDF_FETCH_CONCURRENCY", "32")))
CERT_PROCESS_TIMEOUT = max(1, int(os.getenv("CERT_PROCESS_TIMEOUT", "900")))
FULL_REFRESH = os.getenv("FULL_REFRESH", "0") == "1"

# Path to NIST-CMVP-ReportGen database (if available for importing algorithms)
Expand Down Expand Up @@ -267,6 +269,7 @@
"algorithm_source_security_policy_pdf",
"algorithm_source_database",
"algorithm_source_none",
"certificate_timeouts",
)


Expand Down Expand Up @@ -393,6 +396,7 @@ def build_extraction_metrics(active_stats: Dict[str, int], historical_stats: Dic
"concurrency": {
"certificate_fetch": CERT_FETCH_CONCURRENCY,
"security_policy_fetch": PDF_FETCH_CONCURRENCY,
"certificate_process_timeout_seconds": CERT_PROCESS_TIMEOUT,
},
}

Expand Down Expand Up @@ -1735,6 +1739,153 @@ async def process_certificate_record(
return module_out, detail_payload, module_categories, stats


def build_certificate_timeout_result(
    module: Dict,
    dataset: str,
    generated_at: str,
    algorithm_source: str,
    previous_module: Optional[Dict],
    previous_detail: Optional[Dict],
    previous_metadata: Dict,
) -> Tuple[Dict, Optional[Dict], List[str], Dict[str, int]]:
    """Build a bounded fallback result when one certificate exceeds the timeout.

    Reuses previously generated outputs where available so a slow certificate
    degrades gracefully instead of losing data: the cached detail payload is
    reused when present, and cached algorithm fields are preserved even when
    only the cached module row (from modules.json) is available.

    Returns:
        Tuple of (module row, optional detail payload, algorithm categories,
        per-certificate processing stats).
    """
    stats = new_processing_stats()
    stats["certificate_timeouts"] += 1

    cert_number = parse_certificate_number(module)
    module_out = dict(previous_module or {})
    module_out.update(module)

    source_url = module_out.get("security_policy_url")
    if cert_number is not None and not source_url:
        source_url = get_security_policy_url(cert_number)

    # Honor an explicit ALGORITHM_SOURCE=none: never repopulate algorithms
    # from cache when extraction has been deliberately disabled, so timed-out
    # records stay consistent with the rest of the run.
    if algorithm_source == "none":
        categories, detailed = [], []
    else:
        categories, detailed = cached_algorithm_fields(previous_module, previous_detail)
    have_cached_algorithms = bool(categories or detailed)
    attempt = {
        "source": algorithm_source,
        "url": str(source_url or ""),
        "status": "timeout",
    }

    detail_payload: Optional[Dict] = None
    if cert_number is not None and previous_detail:
        detail_payload = prepare_reused_detail_payload(
            previous_detail,
            module,
            cert_number,
            dataset,
            generated_at,
        )
        stats["html_reused"] += 1
        for key in MODULE_DETAIL_FIELDS:
            value = detail_payload.get(key)
            if value not in (None, [], "", {}):
                module_out[key] = value
        module_out["security_policy_url"] = detail_payload.get("security_policy_url") or module_out.get("security_policy_url")
        cached_source, cached_source_url = cached_algorithm_extraction_source(
            previous_module,
            previous_detail,
            previous_metadata,
        )
        # NOTE(review): when algorithm_source == "none" this records a "miss";
        # confirm whether the project uses a dedicated "skipped" status here.
        provenance = build_algorithm_extraction_provenance(
            algorithm_source,
            "cached" if have_cached_algorithms else "miss",
            cached_source if have_cached_algorithms else "timeout",
            cached_source_url or source_url,
            categories,
            detailed,
            cached=have_cached_algorithms,
            attempts=[attempt],
        )
        if have_cached_algorithms:
            stats["pdf_reused"] += 1
            stats["algorithm_cache_hits"] += 1
            stats["algorithm_successes"] += 1
        elif algorithm_source != "none":
            # A disabled source is a deliberate skip, not a miss.
            stats["algorithm_misses"] += 1
        apply_algorithm_fields(detail_payload, categories, detailed)
        apply_algorithm_extraction_provenance(detail_payload, provenance, include_attempts=True)
        apply_algorithm_fields(module_out, categories, detailed)
        apply_algorithm_extraction_provenance(module_out, provenance)
        module_out["detail_available"] = True
        return module_out, detail_payload, categories, stats

    stats["html_failed"] += 1

    if have_cached_algorithms:
        # The detail JSON is missing, but the cached module row (loaded from
        # api/modules.json) still carries algorithms -- preserve them rather
        # than discarding previously extracted data on timeout.
        cached_source, cached_source_url = cached_algorithm_extraction_source(
            previous_module,
            previous_detail,
            previous_metadata,
        )
        provenance = build_algorithm_extraction_provenance(
            algorithm_source,
            "cached",
            cached_source,
            cached_source_url or source_url,
            categories,
            detailed,
            cached=True,
            attempts=[attempt],
        )
        stats["pdf_reused"] += 1
        stats["algorithm_cache_hits"] += 1
        stats["algorithm_successes"] += 1
        apply_algorithm_fields(module_out, categories, detailed)
        apply_algorithm_extraction_provenance(module_out, provenance)
        # No cached detail payload file exists, so the detail stays unavailable.
        module_out["detail_available"] = False
        return module_out, None, categories, stats

    if algorithm_source in CACHEABLE_ALGORITHM_SOURCES:
        stats["pdf_failed"] += 1
    if algorithm_source != "none":
        stats["algorithm_misses"] += 1
    strip_algorithm_fields(module_out)
    provenance = build_algorithm_extraction_provenance(
        algorithm_source,
        "miss",
        "timeout",
        source_url,
        [],
        [],
        attempts=[attempt],
    )
    apply_algorithm_extraction_provenance(module_out, provenance)
    module_out["detail_available"] = False
    return module_out, None, [], stats
Comment on lines +1763 to +1830
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Action required

1. Cached algorithms dropped 🐞 Bug ≡ Correctness

build_certificate_timeout_result() strips algorithm fields and returns empty categories when
previous_detail is missing, even if previous_module contains cached algorithms. This causes
timed-out certificates to lose previously extracted algorithm data loaded from modules.json.
Agent Prompt
## Issue description
In `build_certificate_timeout_result()`, cached algorithm fields are computed from `(previous_module, previous_detail)` but are only applied/returned when `previous_detail` exists. If the detail JSON is missing but the cached module row contains algorithms, the timeout fallback currently calls `strip_algorithm_fields(module_out)` and returns `[]`, discarding available cached algorithms.

## Issue Context
`load_previous_outputs()` loads cached module rows from `api/modules.json` and certificate detail payloads separately from `DETAIL_DIR`. It is therefore valid for `previous_module` to exist while `previous_detail` is absent; the timeout fallback should still preserve cached algorithm fields from `previous_module` in that case.

## Fix Focus Areas
- scraper.py[830-860]
- scraper.py[1742-1830]

### Suggested implementation direction
- On the `previous_detail is None` timeout path, if `categories`/`detailed` from `cached_algorithm_fields(previous_module, None)` are non-empty:
  - apply them to `module_out` via `apply_algorithm_fields(...)`
  - build provenance with a status like `cached` (or a clear timeout-specific cached status) and include the timeout attempt
  - increment the same cache-hit counters used in the `previous_detail` branch (`pdf_reused`, `algorithm_cache_hits`, `algorithm_successes`) as appropriate
  - return those categories instead of `[]`
- Keep `detail_available` as `False` and `detail_payload` as `None` when there is no cached detail payload file.

ⓘ Copy this prompt and use it to remediate the issue with your preferred AI generation tools



async def process_certificate_record_with_timeout(
    index: int,
    module: Dict,
    dataset: str,
    generated_at: str,
    algorithm_source: str,
    previous_module: Optional[Dict],
    previous_detail: Optional[Dict],
    previous_metadata: Dict,
    client: httpx.AsyncClient,
    cert_semaphore: asyncio.Semaphore,
    pdf_semaphore: asyncio.Semaphore,
    pdf_cache: Dict[str, asyncio.Task],
    pdf_cache_lock: asyncio.Lock,
    database_algorithms_map: Dict[int, List[str]],
) -> Tuple[int, Dict, Optional[Dict], List[str], Dict[str, int]]:
    """Process one certificate and return its input index, timing out slow records.

    On timeout -- or when a cancellation leaks out of a shared pdf_cache task
    that another worker's timeout cancelled -- the record degrades to the
    cached-data fallback instead of failing the whole run.
    """

    def _timeout_fallback() -> Tuple[int, Dict, Optional[Dict], List[str], Dict[str, int]]:
        # Emit the warning and build the bounded fallback from cached outputs.
        cert_number = parse_certificate_number(module)
        print(
            f"Warning: Timed out processing certificate {cert_number or 'unknown'} "
            f"after {CERT_PROCESS_TIMEOUT}s; preserving cached data when available.",
            file=sys.stderr,
        )
        module_out, detail_payload, categories, stats = build_certificate_timeout_result(
            module,
            dataset,
            generated_at,
            algorithm_source,
            previous_module,
            previous_detail,
            previous_metadata,
        )
        return index, module_out, detail_payload, categories, stats

    try:
        module_out, detail_payload, categories, stats = await asyncio.wait_for(
            process_certificate_record(
                module,
                dataset,
                generated_at,
                algorithm_source,
                previous_module,
                previous_detail,
                previous_metadata,
                client,
                cert_semaphore,
                pdf_semaphore,
                pdf_cache,
                pdf_cache_lock,
                database_algorithms_map,
            ),
            timeout=CERT_PROCESS_TIMEOUT,
        )
        return index, module_out, detail_payload, categories, stats
    except asyncio.TimeoutError:
        return _timeout_fallback()
    except asyncio.CancelledError:
        # wait_for cancels process_certificate_record on timeout; if that
        # coroutine was awaiting a shared pdf_cache task, the cancellation can
        # propagate to other workers awaiting the same cached task as a
        # CancelledError rather than a TimeoutError. Swallow it ONLY when this
        # task was not itself cancelled (detectable on Python 3.11+ via
        # Task.cancelling()); otherwise re-raise so genuine shutdown
        # cancellation still propagates. On older Pythons the getattr guard
        # keeps the original re-raise behavior.
        current = asyncio.current_task()
        cancelling = getattr(current, "cancelling", None)
        if cancelling is not None and cancelling() == 0:
            return _timeout_fallback()
        raise


async def build_certificate_artifacts(
modules: List[Dict],
dataset: str,
Expand Down Expand Up @@ -1770,7 +1921,8 @@ async def build_certificate_artifacts(
cert_number = parse_certificate_number(module)
tasks.append(
asyncio.create_task(
process_certificate_record(
process_certificate_record_with_timeout(
index,
module,
dataset,
generated_at,
Expand All @@ -1790,8 +1942,8 @@ async def build_certificate_artifacts(

total = len(tasks)
completed = 0
for index, task in enumerate(tasks):
module_out, detail_payload, categories, task_stats = await task
for task in asyncio.as_completed(tasks):
index, module_out, detail_payload, categories, task_stats = await task
completed += 1
results[index] = module_out
cert_number = parse_certificate_number(module_out)
Expand Down
84 changes: 83 additions & 1 deletion test_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import tempfile
from pathlib import Path
from types import SimpleNamespace
import scraper as scraper_module
from scraper import (
ALGORITHM_CACHE_VERSION,
ALGORITHM_EXTRACTION_SCHEMA_VERSION,
Expand Down Expand Up @@ -744,14 +745,16 @@ def test_algorithm_extraction_provenance_and_metrics():
assert provenance["detailed_algorithm_count"] == 2, "Detailed algorithm count mismatch"
assert len(provenance["attempts"]) == 2, "Attempt provenance should be retained for detail records"

active_stats = {"html_reused": 3, "algorithm_successes": 2, "algorithm_fallbacks": 1}
active_stats = {"html_reused": 3, "algorithm_successes": 2, "algorithm_fallbacks": 1, "certificate_timeouts": 1}
historical_stats = {"html_refreshed": 4, "algorithm_misses": 1}
metrics = build_extraction_metrics(active_stats, historical_stats)
assert metrics["combined"]["html_reused"] == 3, "Combined metrics should include active counters"
assert metrics["combined"]["html_refreshed"] == 4, "Combined metrics should include historical counters"
assert metrics["combined"]["algorithm_successes"] == 2, "Combined metrics should include successes"
assert metrics["combined"]["algorithm_misses"] == 1, "Combined metrics should include misses"
assert metrics["combined"]["certificate_timeouts"] == 1, "Combined metrics should include certificate timeouts"
assert "concurrency" in metrics, "Extraction metrics should record concurrency settings"
assert "certificate_process_timeout_seconds" in metrics["concurrency"], "Extraction metrics should record certificate timeout"

print("✓ Algorithm provenance and metrics test passed")

Expand Down Expand Up @@ -863,6 +866,84 @@ def test_process_certificate_record_applies_cached_algorithm_provenance():
print("✓ Cached algorithm provenance application test passed")


def test_process_certificate_record_timeout_preserves_cached_data():
    """Timed-out certificate work should preserve cached detail and algorithm payloads."""
    policy_url = "https://csrc.nist.gov/CSRC/media/projects/cryptographic-module-validation-program/documents/security-policies/140sp5238.pdf"
    detail_url = "https://csrc.nist.gov/projects/cryptographic-module-validation-program/certificate/5238"
    record = {
        "Certificate Number": "5238",
        "Vendor Name": "SUSE LLC",
        "Module Name": "SUSE Linux Enterprise OpenSSL 1 Cryptographic Module",
        "Module Type": "Software",
        "Validation Date": "04/10/2026",
        "security_policy_url": policy_url,
        "certificate_detail_url": detail_url,
    }
    cached_detail = {
        "certificate_number": "5238",
        "dataset": "active",
        "generated_at": "2026-04-01T00:00:00Z",
        "nist_page_url": detail_url,
        "certificate_detail_url": detail_url,
        "security_policy_url": policy_url,
        "vendor_name": "SUSE LLC",
        "module_name": "SUSE Linux Enterprise OpenSSL 1 Cryptographic Module",
        "software_versions": "3.0.9",
        "hardware_versions": None,
        "firmware_versions": None,
        "algorithms": ["AES", "HMAC"],
        "algorithms_detailed": ["AES-CBC A1", "HMAC SHA2-256 A1"],
        "algorithm_extraction": {
            "source": "crawl4ai",
            "source_url": policy_url,
        },
    }
    cached_metadata = {
        "algorithm_source": "crawl4ai",
        "algorithm_cache_version": ALGORITHM_CACHE_VERSION,
    }

    async def never_finishes(*args, **kwargs):
        # Simulates a record that outlives the configured timeout.
        await asyncio.sleep(0.05)
        raise AssertionError("timeout wrapper should not wait for slow process to finish")

    saved_process = scraper_module.process_certificate_record
    saved_timeout = scraper_module.CERT_PROCESS_TIMEOUT
    scraper_module.process_certificate_record = never_finishes
    scraper_module.CERT_PROCESS_TIMEOUT = 0.01
    try:
        result = asyncio.run(
            scraper_module.process_certificate_record_with_timeout(
                7,
                record,
                "active",
                "2026-04-12T03:10:00.961597Z",
                "crawl4ai",
                record,
                cached_detail,
                cached_metadata,
                None,
                asyncio.Semaphore(1),
                asyncio.Semaphore(1),
                {},
                asyncio.Lock(),
                {},
            )
        )
    finally:
        scraper_module.process_certificate_record = saved_process
        scraper_module.CERT_PROCESS_TIMEOUT = saved_timeout

    index, module_out, detail_payload, categories, stats = result
    assert index == 7, "Timeout wrapper should preserve task index"
    assert categories == ["AES", "HMAC"], "Timeout fallback should preserve cached categories"
    assert module_out["detail_available"] is True, "Timeout fallback should preserve cached detail availability"
    assert module_out["algorithm_extraction"]["status"] == "cached", "Timeout fallback should mark cached algorithms"
    assert detail_payload["algorithm_extraction"]["attempts"][0]["status"] == "timeout", "Detail provenance should record timeout attempt"
    assert stats["certificate_timeouts"] == 1, "Timeout fallback should increment certificate_timeouts"
    assert stats["html_reused"] == 1, "Timeout fallback should reuse cached detail"
    assert stats["algorithm_cache_hits"] == 1, "Timeout fallback should count cached algorithms"

    print("✓ Certificate timeout fallback test passed")


def test_prune_orphan_certificate_details():
"""Test that stale certificate detail files are removed only for missing certs."""
with tempfile.TemporaryDirectory() as temp_dir:
Expand Down Expand Up @@ -1185,6 +1266,7 @@ def main():
test_algorithm_extraction_provenance_and_metrics()
test_fetch_policy_pdf_bytes_reuses_in_run_cache()
test_process_certificate_record_applies_cached_algorithm_provenance()
test_process_certificate_record_timeout_preserves_cached_data()
test_prune_orphan_certificate_details()
test_validate_generated_api_artifacts()
test_build_certificate_index_payload()
Expand Down