From 63f62c4a28e3e3ac5673c2b91fbdeea9d38405a1 Mon Sep 17 00:00:00 2001 From: Richard Lundeen Date: Fri, 13 Mar 2026 12:44:37 -0700 Subject: [PATCH] Adding openai invalid_prompt safety blocks as content filters --- .../openai/openai_error_handling.py | 7 +++++++ tests/unit/target/test_openai_error_handling.py | 17 +++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/pyrit/prompt_target/openai/openai_error_handling.py b/pyrit/prompt_target/openai/openai_error_handling.py index db275ee934..be1d9484c4 100644 --- a/pyrit/prompt_target/openai/openai_error_handling.py +++ b/pyrit/prompt_target/openai/openai_error_handling.py @@ -77,6 +77,13 @@ def _is_content_filter_error(data: Union[dict[str, object], str]) -> bool: code = error_obj.get("code") if isinstance(error_obj, dict) else None if code in ["content_filter", "moderation_blocked"]: return True + # OpenAI uses "invalid_prompt" for model-level safety blocks (e.g. CBRN topics). + # Only treat it as a content filter when the message indicates a safety block, + # not for other invalid_prompt reasons (e.g. malformed schemas). 
+ if code == "invalid_prompt": + message = error_obj.get("message", "") if isinstance(error_obj, dict) else "" + if "limited access" in str(message).lower() or "safety" in str(message).lower(): + return True # Heuristic: Azure sometimes uses other codes with policy-related content return "content_filter" in json.dumps(data).lower() # String-based heuristic search diff --git a/tests/unit/target/test_openai_error_handling.py b/tests/unit/target/test_openai_error_handling.py index e0f3ffe275..40429044b7 100644 --- a/tests/unit/target/test_openai_error_handling.py +++ b/tests/unit/target/test_openai_error_handling.py @@ -20,6 +20,23 @@ def test_is_content_filter_error_with_string(): assert _is_content_filter_error(error_str) is True +def test_is_content_filter_error_invalid_prompt_safety_block(): + """Test detection with invalid_prompt code and safety-related message (CBRN block)""" + data = { + "error": { + "code": "invalid_prompt", + "message": "Invalid prompt: we've limited access to this content for safety reasons.", + } + } + assert _is_content_filter_error(data) is True + + +def test_is_content_filter_error_invalid_prompt_non_safety(): + """Test that invalid_prompt without a safety message is NOT treated as a content filter""" + data = {"error": {"code": "invalid_prompt", "message": "Invalid prompt: schema validation failed."}} + assert _is_content_filter_error(data) is False + + def test_is_content_filter_error_no_filter(): """Test detection returns False when no content_filter""" error_dict = {"error": {"code": "rate_limit", "message": "Too many requests"}}