-
Notifications
You must be signed in to change notification settings - Fork 3.6k
[WEB-4172] feat: Crawl work item links for title and favicon #7117
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
21 commits
Select commit
Hold shift + click to select a range
efbccea
feat: added a python bg task to crawl work item links for title and d…
sriramveeraghanta 9d097d7
fix: return meta_data in the response
sangeethailango 4e6958f
fix: add validation for accessing IP ranges
sangeethailango c75e92c
fix: remove json.dumps
sangeethailango cadaf86
fix: handle exception by returning None
sangeethailango 4ebbe00
refactor: call find_favicon_url inside fetch_and_encode_favicon function
sangeethailango e7bbedf
chore: type hints
sangeethailango aea4c32
fix: Handle None
sangeethailango fa63779
fix: remove print statements
sangeethailango 83128c2
chore: added favicon and title of links
JayashTripathy 5d1a6b1
fix: return null if no title found
sangeethailango bb5db0a
Update apiserver/plane/bgtasks/work_item_link_task.py
sangeethailango 3e4d1a4
fix: remove exception handling
sangeethailango 46808d5
fix: reduce timeout seconds
sangeethailango d87d2be
fix: handle timeout exception
sangeethailango ed18c07
fix: remove request timeout handling
sangeethailango d898d3e
Merge branch 'preview' into feat-crawl-work-item-link
sangeethailango b315e23
Merge branch 'feat-crawl-work-item-link' of https://github.com/makepl…
JayashTripathy e3d8f32
feat: add Link icon to issue detail links and update rendering logic
JayashTripathy 0bc2679
fix: use logger for exception
sangeethailango 3b8b340
Merge pull request #7130 from makeplane/crawl-links-ui
sangeethailango File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,185 @@ | ||
| # Python imports | ||
| import logging | ||
|
|
||
|
|
||
| # Third party imports | ||
| from celery import shared_task | ||
| import requests | ||
| from bs4 import BeautifulSoup | ||
| from urllib.parse import urlparse, urljoin | ||
| import base64 | ||
| import ipaddress | ||
| from typing import Dict, Any | ||
| from typing import Optional | ||
| from plane.db.models import IssueLink | ||
| from plane.utils.exception_logger import log_exception | ||
|
|
||
# Module logger, registered under the "plane.worker" name.
logger = logging.getLogger("plane.worker")


# Base64-encoded SVG markup (a "link" icon). fetch_and_encode_favicon embeds it
# in a `data:image/svg+xml;base64,` URI as the fallback when no favicon URL is
# found or the favicon download fails.
DEFAULT_FAVICON = "PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0IiBmaWxsPSJub25lIiBzdHJva2U9ImN1cnJlbnRDb2xvciIgc3Ryb2tlLXdpZHRoPSIyIiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiIGNsYXNzPSJsdWNpZGUgbHVjaWRlLWxpbmstaWNvbiBsdWNpZGUtbGluayI+PHBhdGggZD0iTTEwIDEzYTUgNSAwIDAgMCA3LjU0LjU0bDMtM2E1IDUgMCAwIDAtNy4wNy03LjA3bC0xLjcyIDEuNzEiLz48cGF0aCBkPSJNMTQgMTFhNSA1IDAgMCAwLTcuNTQtLjU0bC0zIDNhNSA1IDAgMCAwIDcuMDcgNy4wN2wxLjcxLTEuNzEiLz48L3N2Zz4="  # noqa: E501
|
|
||
|
|
||
@shared_task
def crawl_work_item_link_title(id: str, url: str) -> None:
    """
    Celery task that crawls a URL and stores the result on an issue link.

    Args:
        id (str): Primary key of the IssueLink row to update
        url (str): The URL to crawl for its title and favicon

    Returns:
        None. The dictionary produced by
        crawl_work_item_link_title_and_favicon (which may describe a
        failed crawl) is saved to ``IssueLink.metadata``.
    """
    meta_data = crawl_work_item_link_title_and_favicon(url)

    try:
        issue_link = IssueLink.objects.get(id=id)
    except IssueLink.DoesNotExist:
        # The link can be deleted between enqueueing and execution. There is
        # nothing left to annotate, so skip instead of failing the task.
        logger.warning(
            "IssueLink %s no longer exists; skipping metadata update", id
        )
        return

    issue_link.metadata = meta_data
    issue_link.save()
|
|
||
|
|
||
def _is_internal_address(hostname: Optional[str]) -> bool:
    """Return True when hostname is an IP literal in a non-public range."""
    if not hostname:
        return False

    try:
        ip = ipaddress.ip_address(hostname)
    except ValueError:
        # Not an IP literal (i.e. a domain name): nothing to check here.
        return False

    # An IPv4-mapped IPv6 literal (::ffff:a.b.c.d) must be judged by the
    # IPv4 address it actually reaches.
    mapped = getattr(ip, "ipv4_mapped", None)
    if mapped is not None:
        ip = mapped

    return (
        ip.is_private
        or ip.is_loopback
        or ip.is_reserved
        or ip.is_link_local
        or ip.is_multicast
        or ip.is_unspecified
    )


def crawl_work_item_link_title_and_favicon(url: str) -> Dict[str, Any]:
    """
    Crawls a URL to extract the title and favicon.

    Args:
        url (str): The URL to crawl

    Returns:
        dict: On success, the keys ``title``, ``favicon`` (data URI),
        ``url`` and ``favicon_url``. On failure, the same keys with ``None``
        for everything except ``url``, plus an ``error`` message. This
        function does not raise; every failure is reported in the result.
    """

    def failure(message: str) -> Dict[str, Any]:
        # Same shape as the success result so consumers can read every key.
        return {
            "error": message,
            "title": None,
            "favicon": None,
            "url": url,
            "favicon_url": None,
        }

    # Prevent access to private IP ranges. The policy decision is kept apart
    # from the "is this an IP literal" parse, so that rejecting an address
    # cannot be mistaken for a parse failure and silently ignored.
    try:
        blocked = _is_internal_address(urlparse(url).hostname)
    except Exception as e:
        # Malformed URL (e.g. an unterminated IPv6 bracket).
        log_exception(e)
        return failure(f"Unexpected error: {str(e)}")

    if blocked:
        message = "Access to private/internal networks is not allowed"
        logging.getLogger("plane.worker").warning(
            "Refusing to crawl %s: %s", url, message
        )
        return failure(message)

    # NOTE(review): only IP literals are screened. A domain name that
    # resolves to an internal address, or a redirect to one, is still
    # followed by requests -- consider validating resolved addresses too.

    try:
        # Set up headers to mimic a real browser
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"  # noqa: E501
        }

        # Fetch the main page
        response = requests.get(url, headers=headers, timeout=2)
        response.raise_for_status()

        # Parse HTML
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract title; an empty or whitespace-only <title> counts as missing
        title_tag = soup.find("title")
        title = (title_tag.get_text().strip() or None) if title_tag else None

        # Fetch and encode favicon
        favicon = fetch_and_encode_favicon(headers, soup, url)

        return {
            "title": title,
            "favicon": favicon["favicon_base64"],
            "url": url,
            "favicon_url": favicon["favicon_url"],
        }

    except requests.RequestException as e:
        log_exception(e)
        return failure(f"Request failed: {str(e)}")
    except Exception as e:
        log_exception(e)
        return failure(f"Unexpected error: {str(e)}")
|
|
||
|
|
||
def find_favicon_url(soup: BeautifulSoup, base_url: str) -> Optional[str]:
    """
    Locate the favicon for a parsed page.

    Args:
        soup: BeautifulSoup object for the page
        base_url: URL of the page, used to resolve relative hrefs

    Returns:
        str: Absolute favicon URL. This is the first matching ``<link>``
        tag's href, otherwise ``/favicon.ico`` on the page's host if a HEAD
        request for it answers 200. None when neither is available.
    """
    # Checked in priority order; the first tag carrying an href wins.
    link_selectors = (
        'link[rel="icon"]',
        'link[rel="shortcut icon"]',
        'link[rel="apple-touch-icon"]',
        'link[rel="apple-touch-icon-precomposed"]',
    )

    for css in link_selectors:
        link = soup.select_one(css)
        if not link:
            continue
        href = link.get("href")
        if href:
            return urljoin(base_url, href)

    # No usable <link> tag: probe the conventional location on the same host.
    origin = urlparse(base_url)
    candidate = f"{origin.scheme}://{origin.netloc}/favicon.ico"

    try:
        probe = requests.head(candidate, timeout=2)
        probe.raise_for_status()
    except requests.RequestException as exc:
        log_exception(exc)
        return None

    # Only a plain 200 counts; any other non-error status is treated as absent.
    return candidate if probe.status_code == 200 else None
|
|
||
|
|
||
def fetch_and_encode_favicon(
    headers: Dict[str, str], soup: BeautifulSoup, url: str
) -> Dict[str, Optional[str]]:
    """
    Fetch the page's favicon and encode it as a base64 data URI.

    Args:
        headers: Request headers to send when downloading the favicon
        soup: BeautifulSoup object for the page, searched for favicon links
        url: URL of the page, used to resolve the favicon location

    Returns:
        dict: ``favicon_url`` (absolute URL of the icon, or None when the
        default icon is used) and ``favicon_base64`` (a data URI). This
        function never raises; any failure yields the default icon.
    """
    default_icon: Dict[str, Optional[str]] = {
        "favicon_url": None,
        "favicon_base64": f"data:image/svg+xml;base64,{DEFAULT_FAVICON}",
    }

    try:
        favicon_url = find_favicon_url(soup, url)
        if favicon_url is None:
            return default_icon

        # NOTE(review): favicon_url comes from the crawled page's markup and
        # is fetched without any internal-address check -- confirm whether it
        # should be screened like the page URL.
        response = requests.get(favicon_url, headers=headers, timeout=2)
        response.raise_for_status()

        # Get content type
        content_type = response.headers.get("content-type", "image/x-icon")

        # Servers often answer a missing icon with a 200 HTML/text page (or
        # an empty body); encoding that would produce a broken image.
        media_type = content_type.split(";", 1)[0].strip().lower()
        if not response.content or media_type.startswith("text/"):
            logger.warning(
                "Favicon at %s is not an image (%s); using default",
                favicon_url,
                content_type,
            )
            return default_icon

        # Convert to base64
        encoded = base64.b64encode(response.content).decode("utf-8")

        # Return as data URI
        return {
            "favicon_url": favicon_url,
            "favicon_base64": f"data:{content_type};base64,{encoded}",
        }

    except Exception as e:
        # Best effort: a missing favicon must not fail the whole crawl.
        logger.warning("Failed to fetch favicon: %s", e)
        return default_icon
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.