diff --git a/apps/api/plane/api/serializers/issue.py b/apps/api/plane/api/serializers/issue.py index 69c35624658..075823cbfed 100644 --- a/apps/api/plane/api/serializers/issue.py +++ b/apps/api/plane/api/serializers/issue.py @@ -24,7 +24,6 @@ ) from plane.utils.content_validator import ( validate_html_content, - validate_json_content, validate_binary_data, ) @@ -89,20 +88,24 @@ def validate(self, data): raise serializers.ValidationError("Invalid HTML passed") # Validate description content for security - if data.get("description"): - is_valid, error_msg = validate_json_content(data["description"]) - if not is_valid: - raise serializers.ValidationError({"description": error_msg}) - if data.get("description_html"): - is_valid, error_msg = validate_html_content(data["description_html"]) + is_valid, error_msg, sanitized_html = validate_html_content( + data["description_html"] + ) if not is_valid: - raise serializers.ValidationError({"description_html": error_msg}) + raise serializers.ValidationError( + {"error": "html content is not valid"} + ) + # Update the data with sanitized HTML if available + if sanitized_html is not None: + data["description_html"] = sanitized_html if data.get("description_binary"): is_valid, error_msg = validate_binary_data(data["description_binary"]) if not is_valid: - raise serializers.ValidationError({"description_binary": error_msg}) + raise serializers.ValidationError( + {"description_binary": "Invalid binary data"} + ) # Validate assignees are from project if data.get("assignees", []): diff --git a/apps/api/plane/api/serializers/project.py b/apps/api/plane/api/serializers/project.py index e6a257f3eb4..d860c46b2d2 100644 --- a/apps/api/plane/api/serializers/project.py +++ b/apps/api/plane/api/serializers/project.py @@ -12,7 +12,6 @@ from plane.utils.content_validator import ( validate_html_content, - validate_json_content, ) from .base import BaseSerializer @@ -200,27 +199,18 @@ def validate(self, data): ) # Validate description content for security - if "description" in data and data["description"]: - # For Project, description might be text field, not JSON - if isinstance(data["description"], dict): - is_valid, error_msg = validate_json_content(data["description"]) - if not is_valid: - raise serializers.ValidationError({"description": error_msg}) - - if "description_text" in data and data["description_text"]: - is_valid, error_msg = validate_json_content(data["description_text"]) - if not is_valid: - raise serializers.ValidationError({"description_text": error_msg}) - if "description_html" in data and data["description_html"]: if isinstance(data["description_html"], dict): - is_valid, error_msg = validate_json_content(data["description_html"]) - else: - is_valid, error_msg = validate_html_content( + is_valid, error_msg, sanitized_html = validate_html_content( str(data["description_html"]) ) + # Update the data with sanitized HTML if available + if sanitized_html is not None: + data["description_html"] = sanitized_html if not is_valid: - raise serializers.ValidationError({"description_html": error_msg}) + raise serializers.ValidationError( + {"error": "html content is not valid"} + ) return data diff --git a/apps/api/plane/app/serializers/draft.py b/apps/api/plane/app/serializers/draft.py index 38fa65527b9..852caf8bf36 100644 --- a/apps/api/plane/app/serializers/draft.py +++ b/apps/api/plane/app/serializers/draft.py @@ -23,7 +23,6 @@ ) from plane.utils.content_validator import ( validate_html_content, - validate_json_content, validate_binary_data, ) from plane.app.permissions import ROLE @@ -76,20 +75,24 @@ def validate(self, attrs): raise serializers.ValidationError("Start date cannot exceed target date") # Validate description content for security - if "description" in attrs and attrs["description"]: - is_valid, error_msg = validate_json_content(attrs["description"]) - if not is_valid: - raise serializers.ValidationError({"description": error_msg}) - if "description_html" in attrs and attrs["description_html"]: - is_valid, error_msg = validate_html_content(attrs["description_html"]) + is_valid, error_msg, sanitized_html = validate_html_content( + attrs["description_html"] + ) if not is_valid: - raise serializers.ValidationError({"description_html": error_msg}) + raise serializers.ValidationError( + {"error": "html content is not valid"} + ) + # Update the attrs with sanitized HTML if available + if sanitized_html is not None: + attrs["description_html"] = sanitized_html if "description_binary" in attrs and attrs["description_binary"]: is_valid, error_msg = validate_binary_data(attrs["description_binary"]) if not is_valid: - raise serializers.ValidationError({"description_binary": error_msg}) + raise serializers.ValidationError( + {"description_binary": "Invalid binary data"} + ) # Validate assignees are from project if attrs.get("assignee_ids", []): diff --git a/apps/api/plane/app/serializers/issue.py b/apps/api/plane/app/serializers/issue.py index 691140eba03..1eda376015b 100644 --- a/apps/api/plane/app/serializers/issue.py +++ b/apps/api/plane/app/serializers/issue.py @@ -43,7 +43,6 @@ ) from plane.utils.content_validator import ( validate_html_content, - validate_json_content, validate_binary_data, ) @@ -128,20 +127,24 @@ def validate(self, attrs): raise serializers.ValidationError("Start date cannot exceed target date") # Validate description content for security - if "description" in attrs and attrs["description"]: - is_valid, error_msg = validate_json_content(attrs["description"]) - if not is_valid: - raise serializers.ValidationError({"description": error_msg}) - if "description_html" in attrs and attrs["description_html"]: - is_valid, error_msg = validate_html_content(attrs["description_html"]) + is_valid, error_msg, sanitized_html = validate_html_content( + attrs["description_html"] + ) if not is_valid: - raise serializers.ValidationError({"description_html": error_msg}) + raise serializers.ValidationError( + {"error": "html content is not valid"} + ) + # Update the attrs with sanitized HTML if available + if sanitized_html is not None: + attrs["description_html"] = sanitized_html if "description_binary" in attrs and attrs["description_binary"]: is_valid, error_msg = validate_binary_data(attrs["description_binary"]) if not is_valid: - raise serializers.ValidationError({"description_binary": error_msg}) + raise serializers.ValidationError( + {"description_binary": "Invalid binary data"} + ) # Validate assignees are from project if attrs.get("assignee_ids", []): diff --git a/apps/api/plane/app/serializers/page.py b/apps/api/plane/app/serializers/page.py index 78762e4b4e1..9ac6cc414f8 100644 --- a/apps/api/plane/app/serializers/page.py +++ b/apps/api/plane/app/serializers/page.py @@ -7,7 +7,6 @@ from plane.utils.content_validator import ( validate_binary_data, validate_html_content, - validate_json_content, ) from plane.db.models import ( Page, @@ -229,23 +228,13 @@ def validate_description_html(self, value): return value # Use the validation function from utils - is_valid, error_message = validate_html_content(value) + is_valid, error_message, sanitized_html = validate_html_content(value) if not is_valid: raise serializers.ValidationError(error_message) - return value + # Return sanitized HTML if available, otherwise return original + return sanitized_html if sanitized_html is not None else value - def validate_description(self, value): - """Validate the JSON description""" - if not value: - return value - - # Use the validation function from utils - is_valid, error_message = validate_json_content(value) - if not is_valid: - raise serializers.ValidationError(error_message) - - return value def update(self, instance, validated_data): """Update the page instance with validated data""" diff --git a/apps/api/plane/app/serializers/project.py b/apps/api/plane/app/serializers/project.py index dfa541d9f17..1d1ea927d1c 100644 --- a/apps/api/plane/app/serializers/project.py +++ b/apps/api/plane/app/serializers/project.py @@ -15,7 +15,6 @@ ) from plane.utils.content_validator import ( validate_html_content, - validate_json_content, validate_binary_data, ) @@ -65,27 +64,18 @@ def validate_identifier(self, identifier): def validate(self, data): # Validate description content for security - if "description" in data and data["description"]: - # For Project, description might be text field, not JSON - if isinstance(data["description"], dict): - is_valid, error_msg = validate_json_content(data["description"]) - if not is_valid: - raise serializers.ValidationError({"description": error_msg}) - - if "description_text" in data and data["description_text"]: - is_valid, error_msg = validate_json_content(data["description_text"]) - if not is_valid: - raise serializers.ValidationError({"description_text": error_msg}) - if "description_html" in data and data["description_html"]: - if isinstance(data["description_html"], dict): - is_valid, error_msg = validate_json_content(data["description_html"]) - else: - is_valid, error_msg = validate_html_content( - str(data["description_html"]) - ) + is_valid, error_msg, sanitized_html = validate_html_content( + str(data["description_html"]) + ) + # Update the data with sanitized HTML if available + if sanitized_html is not None: + data["description_html"] = sanitized_html + if not is_valid: - raise serializers.ValidationError({"description_html": error_msg}) + raise serializers.ValidationError( + {"error": "html content is not valid"} + ) return data diff --git a/apps/api/plane/app/serializers/workspace.py b/apps/api/plane/app/serializers/workspace.py index ec4c4bf63e0..6b22f59e83d 100644 --- a/apps/api/plane/app/serializers/workspace.py +++ b/apps/api/plane/app/serializers/workspace.py @@ -26,7 +26,6 @@ from plane.utils.url import contains_url from plane.utils.content_validator import ( validate_html_content, - validate_json_content, validate_binary_data, ) @@ -319,20 +318,24 @@ class Meta: def validate(self, data): # Validate description content for security - if "description" in data and data["description"]: - is_valid, error_msg = validate_json_content(data["description"]) - if not is_valid: - raise serializers.ValidationError({"description": error_msg}) - if "description_html" in data and data["description_html"]: - is_valid, error_msg = validate_html_content(data["description_html"]) + is_valid, error_msg, sanitized_html = validate_html_content( + data["description_html"] + ) if not is_valid: - raise serializers.ValidationError({"description_html": error_msg}) + raise serializers.ValidationError( + {"error": "html content is not valid"} + ) + # Update the data with sanitized HTML if available + if sanitized_html is not None: + data["description_html"] = sanitized_html if "description_binary" in data and data["description_binary"]: is_valid, error_msg = validate_binary_data(data["description_binary"]) if not is_valid: - raise serializers.ValidationError({"description_binary": error_msg}) + raise serializers.ValidationError( + {"description_binary": "Invalid binary data"} + ) return data diff --git a/apps/api/plane/space/serializer/issue.py b/apps/api/plane/space/serializer/issue.py index 3549e76262c..64f151a2d20 100644 --- a/apps/api/plane/space/serializer/issue.py +++ b/apps/api/plane/space/serializer/issue.py @@ -30,7 +30,6 @@ ) from plane.utils.content_validator import ( validate_html_content, - validate_json_content, validate_binary_data, ) @@ -290,20 +289,22 @@ def validate(self, data): raise serializers.ValidationError("Start date cannot exceed target date") # Validate description content for security - if "description" in data and data["description"]: - is_valid, error_msg = validate_json_content(data["description"]) - if not is_valid: - raise serializers.ValidationError({"description": error_msg}) - if "description_html" in data and data["description_html"]: - is_valid, error_msg = validate_html_content(data["description_html"]) + is_valid, error_msg, sanitized_html = validate_html_content( + data["description_html"] + ) if not is_valid: - raise serializers.ValidationError({"description_html": error_msg}) + raise serializers.ValidationError( + {"error": "html content is not valid"} + ) + # Update the data with sanitized HTML if available + if sanitized_html is not None: + data["description_html"] = sanitized_html if "description_binary" in data and data["description_binary"]: is_valid, error_msg = validate_binary_data(data["description_binary"]) if not is_valid: - raise serializers.ValidationError({"description_binary": error_msg}) + raise serializers.ValidationError({"description_binary": "Invalid binary data"}) return data diff --git a/apps/api/plane/utils/content_validator.py b/apps/api/plane/utils/content_validator.py index d28b83fc700..47ee663ffc1 100644 --- a/apps/api/plane/utils/content_validator.py +++ b/apps/api/plane/utils/content_validator.py @@ -1,36 +1,11 @@ # Python imports import base64 -import json -import re - +import nh3 +from plane.utils.exception_logger import log_exception # Maximum allowed size for binary data (10MB) MAX_SIZE = 10 * 1024 * 1024 -# Maximum recursion depth to prevent stack overflow -MAX_RECURSION_DEPTH = 20 - -# Dangerous text patterns that could indicate XSS or script injection -DANGEROUS_TEXT_PATTERNS = [ - r"]*>.*?", - r"javascript\s*:", - r"data\s*:\s*text/html", - r"eval\s*\(", - r"document\s*\.", - r"window\s*\.", - r"location\s*\.", -] - -# Dangerous attribute patterns for HTML attributes -DANGEROUS_ATTR_PATTERNS = [ - r"javascript\s*:", - r"data\s*:\s*text/html", - r"eval\s*\(", - r"alert\s*\(", - r"document\s*\.", - r"window\s*\.", -] - # Suspicious patterns for binary data content SUSPICIOUS_BINARY_PATTERNS = [ "]*>", - r"", - # JavaScript URLs in various attributes - r'(?:href|src|action)\s*=\s*["\']?\s*javascript:', - # Data URLs with text/html (potential XSS) - r'(?:href|src|action)\s*=\s*["\']?\s*data:text/html', - # Dangerous event handlers with JavaScript-like content - r'on(?:load|error|click|focus|blur|change|submit|reset|select|resize|scroll|unload|beforeunload|hashchange|popstate|storage|message|offline|online)\s*=\s*["\']?[^"\']*(?:javascript|alert|eval|document\.|window\.|location\.|history\.)[^"\']*["\']?', - # Object and embed tags that could load external content - r"<(?:object|embed)[^>]*(?:data|src)\s*=", - # Base tag that could change relative URL resolution - r"]*href\s*=", - # Dangerous iframe sources - r']*src\s*=\s*["\']?(?:javascript:|data:text/html)', - # Meta refresh redirects - r']*http-equiv\s*=\s*["\']?refresh["\']?', - # Link tags - simplified patterns - r']*rel\s*=\s*["\']?stylesheet["\']?', - r']*href\s*=\s*["\']?https?://', - r']*href\s*=\s*["\']?//', - r']*href\s*=\s*["\']?(?:data:|javascript:)', - # Style tags with external imports - r"]*>.*?@import.*?(?:https?://|//)", - # Link tags with dangerous rel types - r']*rel\s*=\s*["\']?(?:import|preload|prefetch|dns-prefetch|preconnect)["\']?', - # Forms with action attributes - r"]*action\s*=", -] - -# Dangerous JavaScript patterns for event handlers -DANGEROUS_JS_PATTERNS = [ - r"alert\s*\(", - r"eval\s*\(", - r"document\s*\.", - r"window\s*\.", - r"location\s*\.", - r"fetch\s*\(", - r"XMLHttpRequest", - r"innerHTML\s*=", - r"outerHTML\s*=", - r"document\.write", - r"script\s*>", -] - -# HTML self-closing tags that don't need closing tags -SELF_CLOSING_TAGS = { - "img", - "br", - "hr", - "input", - "meta", - "link", - "area", - "base", - "col", - "embed", - "source", - "track", - "wbr", -} - def validate_binary_data(data): """ @@ -149,191 +60,21 @@ def validate_binary_data(data): return True, None -def validate_html_content(html_content): +def validate_html_content(html_content: str): """ - Validate that HTML content is safe and doesn't contain malicious patterns. - - Args: - html_content (str): The HTML content to validate - - Returns: - tuple: (is_valid: bool, error_message: str or None) + Sanitize HTML content using nh3. + Returns a tuple: (is_valid, error_message, clean_html) """ if not html_content: - return True, None # Empty is OK + return True, None, None # Size check - 10MB limit (consistent with binary validation) if len(html_content.encode("utf-8")) > MAX_SIZE: - return False, "HTML content exceeds maximum size limit (10MB)" - - # Check for specific malicious patterns (simplified and more reliable) - for pattern in MALICIOUS_HTML_PATTERNS: - if re.search(pattern, html_content, re.IGNORECASE | re.DOTALL): - return ( - False, - f"HTML content contains potentially malicious patterns: {pattern}", - ) - - # Additional check for inline event handlers that contain suspicious content - # This is more permissive - only blocks if the event handler contains actual dangerous code - event_handler_pattern = r'on\w+\s*=\s*["\']([^"\']*)["\']' - event_matches = re.findall(event_handler_pattern, html_content, re.IGNORECASE) - - for handler_content in event_matches: - for js_pattern in DANGEROUS_JS_PATTERNS: - if re.search(js_pattern, handler_content, re.IGNORECASE): - return ( - False, - f"HTML content contains dangerous JavaScript in event handler: {handler_content[:100]}", - ) - - - return True, None - - -def validate_json_content(json_content): - """ - Validate that JSON content is safe and doesn't contain malicious patterns. - - Args: - json_content (dict): The JSON content to validate - - Returns: - tuple: (is_valid: bool, error_message: str or None) - """ - if not json_content: - return True, None # Empty is OK + return False, "HTML content exceeds maximum size limit (10MB)", None try: - # Size check - 10MB limit (consistent with other validations) - json_str = json.dumps(json_content) - if len(json_str.encode("utf-8")) > MAX_SIZE: - return False, "JSON content exceeds maximum size limit (10MB)" - - # Basic structure validation for page description JSON - if isinstance(json_content, dict): - # Check for expected page description structure - # This is based on ProseMirror/Tiptap JSON structure - if "type" in json_content and json_content.get("type") == "doc": - # Valid document structure - if "content" in json_content and isinstance( - json_content["content"], list - ): - # Recursively check content for suspicious patterns - is_valid, error_msg = _validate_json_content_array( - json_content["content"] - ) - if not is_valid: - return False, error_msg - elif "type" not in json_content and "content" not in json_content: - # Allow other JSON structures but validate for suspicious content - is_valid, error_msg = _validate_json_content_recursive(json_content) - if not is_valid: - return False, error_msg - else: - return False, "JSON description must be a valid object" - - except (TypeError, ValueError) as e: - return False, "Invalid JSON structure" + clean_html = nh3.clean(html_content) + return True, None, clean_html except Exception as e: - return False, "Failed to validate JSON content" - - return True, None - - -def _validate_json_content_array(content, depth=0): - """ - Validate JSON content array for suspicious patterns. - - Args: - content (list): Array of content nodes to validate - depth (int): Current recursion depth (default: 0) - - Returns: - tuple: (is_valid: bool, error_message: str or None) - """ - # Check recursion depth to prevent stack overflow - if depth > MAX_RECURSION_DEPTH: - return False, f"Maximum recursion depth ({MAX_RECURSION_DEPTH}) exceeded" - - if not isinstance(content, list): - return True, None - - for node in content: - if isinstance(node, dict): - # Check text content for suspicious patterns (more targeted) - if node.get("type") == "text" and "text" in node: - text_content = node["text"] - for pattern in DANGEROUS_TEXT_PATTERNS: - if re.search(pattern, text_content, re.IGNORECASE): - return ( - False, - "JSON content contains suspicious script patterns in text", - ) - - # Check attributes for suspicious content (more targeted) - if "attrs" in node and isinstance(node["attrs"], dict): - for attr_name, attr_value in node["attrs"].items(): - if isinstance(attr_value, str): - # Only check specific attributes that could be dangerous - if attr_name.lower() in [ - "href", - "src", - "action", - "onclick", - "onload", - "onerror", - ]: - for pattern in DANGEROUS_ATTR_PATTERNS: - if re.search(pattern, attr_value, re.IGNORECASE): - return ( - False, - f"JSON content contains dangerous pattern in {attr_name} attribute", - ) - - # Recursively check nested content - if "content" in node and isinstance(node["content"], list): - is_valid, error_msg = _validate_json_content_array( - node["content"], depth + 1 - ) - if not is_valid: - return False, error_msg - - return True, None - - -def _validate_json_content_recursive(obj, depth=0): - """ - Recursively validate JSON object for suspicious content. - - Args: - obj: JSON object (dict, list, or primitive) to validate - depth (int): Current recursion depth (default: 0) - - Returns: - tuple: (is_valid: bool, error_message: str or None) - """ - # Check recursion depth to prevent stack overflow - if depth > MAX_RECURSION_DEPTH: - return False, f"Maximum recursion depth ({MAX_RECURSION_DEPTH}) exceeded" - if isinstance(obj, dict): - for key, value in obj.items(): - if isinstance(value, str): - # Check for dangerous patterns using module constants - for pattern in DANGEROUS_TEXT_PATTERNS: - if re.search(pattern, value, re.IGNORECASE): - return ( - False, - "JSON content contains suspicious script patterns", - ) - elif isinstance(value, (dict, list)): - is_valid, error_msg = _validate_json_content_recursive(value, depth + 1) - if not is_valid: - return False, error_msg - elif isinstance(obj, list): - for item in obj: - is_valid, error_msg = _validate_json_content_recursive(item, depth + 1) - if not is_valid: - return False, error_msg - - return True, None + log_exception(e) + return False, "Failed to sanitize HTML", None diff --git a/apps/api/requirements/base.txt b/apps/api/requirements/base.txt index 39131632227..8b0a720d481 100644 --- a/apps/api/requirements/base.txt +++ b/apps/api/requirements/base.txt @@ -69,3 +69,5 @@ opentelemetry-instrumentation-django==0.49b1 opentelemetry-exporter-otlp==1.28.1 # OpenAPI Specification drf-spectacular==0.28.0 +# html sanitizer +nh3==0.2.18