Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions pyrit/prompt_target/openai/openai_error_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,13 @@ def _is_content_filter_error(data: Union[dict[str, object], str]) -> bool:
code = error_obj.get("code") if isinstance(error_obj, dict) else None
if code in ["content_filter", "moderation_blocked"]:
return True
# OpenAI uses "invalid_prompt" for model-level safety blocks (e.g. CBRN topics).
# Only treat it as a content filter when the message indicates a safety block,
# not for other invalid_prompt reasons (e.g. malformed schemas).
if code == "invalid_prompt":
message = error_obj.get("message", "") if isinstance(error_obj, dict) else ""
Comment thread
rlundeen2 marked this conversation as resolved.
if "limited access" in str(message).lower() or "safety" in str(message).lower():
return True
# Heuristic: Azure sometimes uses other codes with policy-related content
return "content_filter" in json.dumps(data).lower()
# String-based heuristic search
Expand Down
17 changes: 17 additions & 0 deletions tests/unit/target/test_openai_error_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,23 @@ def test_is_content_filter_error_with_string():
assert _is_content_filter_error(error_str) is True


def test_is_content_filter_error_invalid_prompt_safety_block():
    """An invalid_prompt error whose message cites a safety restriction counts as a content filter."""
    # Payload modelled on a model-level safety block (e.g. a CBRN topic).
    safety_message = "Invalid prompt: we've limited access to this content for safety reasons."
    payload = {"error": {"code": "invalid_prompt", "message": safety_message}}

    detected = _is_content_filter_error(payload)

    assert detected is True


def test_is_content_filter_error_invalid_prompt_non_safety():
    """An invalid_prompt error with no safety wording must not be reported as a content filter."""
    payload = {
        "error": {
            "code": "invalid_prompt",
            "message": "Invalid prompt: schema validation failed.",
        }
    }

    detected = _is_content_filter_error(payload)

    assert detected is False


def test_is_content_filter_error_no_filter():
"""Test detection returns False when no content_filter"""
error_dict = {"error": {"code": "rate_limit", "message": "Too many requests"}}
Expand Down
Loading