{
  "data": {
    "edges": [
      {
        "animated": false,
        "className": "",
        "data": {
          "sourceHandle": {
            "dataType": "URLComponent",
            "id": "URLComponent-ZszYV",
            "name": "raw_results",
            "output_types": [
              "Message"
            ]
          },
          "targetHandle": {
            "fieldName": "data_inputs",
            "id": "SplitText-HYi2s",
            "inputTypes": [
              "Data",
              "DataFrame",
              "Message"
            ],
            "type": "other"
          }
        },
        "id": "xy-edge__URLComponent-ZszYV{œdataTypeœ:œURLComponentœ,œidœ:œURLComponent-ZszYVœ,œnameœ:œraw_resultsœ,œoutput_typesœ:[œMessageœ]}-SplitText-HYi2s{œfieldNameœ:œdata_inputsœ,œidœ:œSplitText-HYi2sœ,œinputTypesœ:[œDataœ,œDataFrameœ,œMessageœ],œtypeœ:œotherœ}",
        "selected": false,
        "source": "URLComponent-ZszYV",
        "sourceHandle": "{œdataTypeœ:œURLComponentœ,œidœ:œURLComponent-ZszYVœ,œnameœ:œraw_resultsœ,œoutput_typesœ:[œMessageœ]}",
        "target": "SplitText-HYi2s",
        "targetHandle": "{œfieldNameœ:œdata_inputsœ,œidœ:œSplitText-HYi2sœ,œinputTypesœ:[œDataœ,œDataFrameœ,œMessageœ],œtypeœ:œotherœ}"
      }
    ],
    "nodes": [
      {
        "data": {
          "id": "URLComponent-ZszYV",
          "node": {
            "base_classes": [
              "DataFrame",
              "Message"
            ],
            "beta": false,
            "conditional_paths": [],
            "custom_fields": {},
            "description": "Fetch content from one or more web pages, following links recursively.",
            "display_name": "URL",
            "documentation": "https://docs.langflow.org/components-data#url",
            "edited": false,
            "field_order": [
              "urls",
              "max_depth",
              "prevent_outside",
              "use_async",
              "format",
              "timeout",
              "headers",
              "filter_text_html",
              "continue_on_failure",
              "check_response_status",
              "autoset_encoding"
            ],
            "frozen": false,
            "icon": "layout-template",
            "legacy": false,
            "lf_version": "1.6.0",
            "metadata": {
              "code_hash": "252132357639",
              "dependencies": {
                "dependencies": [
                  {
                    "name": "requests",
                    "version": "2.32.5"
                  },
                  {
                    "name": "bs4",
                    "version": "4.12.3"
                  },
                  {
                    "name": "langchain_community",
                    "version": "0.3.21"
                  },
                  {
                    "name": "langflow",
                    "version": null
                  }
                ],
                "total_dependencies": 4
              },
              "module": "langflow.components.data.url.URLComponent"
            },
            "minimized": false,
            "output_types": [],
            "outputs": [
              {
                "allows_loop": false,
                "cache": true,
                "display_name": "Extracted Pages",
                "group_outputs": false,
                "method": "fetch_content",
                "name": "page_results",
                "tool_mode": true,
                "types": [
                  "DataFrame"
                ],
                "value": "__UNDEFINED__"
              },
              {
                "allows_loop": false,
                "cache": true,
                "display_name": "Raw Content",
                "group_outputs": false,
                "method": "fetch_content_as_message",
                "name": "raw_results",
                "selected": "Message",
                "tool_mode": false,
                "types": [
                  "Message"
                ],
                "value": "__UNDEFINED__"
              }
            ],
            "pinned": false,
            "template": {
              "_type": "Component",
              "autoset_encoding": {
                "_input_type": "BoolInput",
                "advanced": true,
                "display_name": "Autoset Encoding",
                "dynamic": false,
                "info": "If enabled, automatically sets the encoding of the request.",
                "list": false,
                "list_add_label": "Add More",
                "name": "autoset_encoding",
                "placeholder": "",
                "required": false,
                "show": true,
                "title_case": false,
                "tool_mode": false,
                "trace_as_metadata": true,
                "type": "bool",
                "value": true
              },
              "check_response_status": {
                "_input_type": "BoolInput",
                "advanced": true,
                "display_name": "Check Response Status",
                "dynamic": false,
                "info": "If enabled, checks the response status of the request.",
                "list": false,
                "list_add_label": "Add More",
                "name": "check_response_status",
                "placeholder": "",
                "required": false,
                "show": true,
                "title_case": false,
                "tool_mode": false,
                "trace_as_metadata": true,
                "type": "bool",
                "value": false
              },
              "code": {
                "advanced": true,
                "dynamic": true,
                "fileTypes": [],
                "file_path": "",
                "info": "",
                "list": false,
                "load_from_db": false,
                "multiline": true,
                "name": "code",
                "password": false,
                "placeholder": "",
                "required": true,
                "show": true,
                "title_case": false,
                "type": "code",
                "value": "import re\n\nimport requests\nfrom bs4 import BeautifulSoup\nfrom langchain_community.document_loaders import RecursiveUrlLoader\n\nfrom langflow.custom.custom_component.component import Component\nfrom langflow.field_typing.range_spec import RangeSpec\nfrom langflow.helpers.data import safe_convert\nfrom langflow.io import BoolInput, DropdownInput, IntInput, MessageTextInput, Output, SliderInput, TableInput\nfrom langflow.logging.logger import logger\nfrom langflow.schema.dataframe import DataFrame\nfrom langflow.schema.message import Message\nfrom langflow.services.deps import get_settings_service\n\n# Constants\nDEFAULT_TIMEOUT = 30\nDEFAULT_MAX_DEPTH = 1\nDEFAULT_FORMAT = \"Text\"\nURL_REGEX = re.compile(\n    r\"^(https?:\\/\\/)?\" r\"(www\\.)?\" r\"([a-zA-Z0-9.-]+)\" r\"(\\.[a-zA-Z]{2,})?\" r\"(:\\d+)?\" r\"(\\/[^\\s]*)?$\",\n    re.IGNORECASE,\n)\n\n\nclass URLComponent(Component):\n    \"\"\"A component that loads and parses content from web pages recursively.\n\n    This component allows fetching content from one or more URLs, with options to:\n    - Control crawl depth\n    - Prevent crawling outside the root domain\n    - Use async loading for better performance\n    - Extract either raw HTML or clean text\n    - Configure request headers and timeouts\n    \"\"\"\n\n    display_name = \"URL\"\n    description = \"Fetch content from one or more web pages, following links recursively.\"\n    documentation: str = \"https://docs.langflow.org/components-data#url\"\n    icon = \"layout-template\"\n    name = \"URLComponent\"\n\n    inputs = [\n        MessageTextInput(\n            name=\"urls\",\n            display_name=\"URLs\",\n            info=\"Enter one or more URLs to crawl recursively, by clicking the '+' button.\",\n            is_list=True,\n            tool_mode=True,\n            placeholder=\"Enter a URL...\",\n            list_add_label=\"Add URL\",\n            input_types=[],\n        ),\n        SliderInput(\n            name=\"max_depth\",\n            display_name=\"Depth\",\n            info=(\n                \"Controls how many 'clicks' away from the initial page the crawler will go:\\n\"\n                \"- depth 1: only the initial page\\n\"\n                \"- depth 2: initial page + all pages linked directly from it\\n\"\n                \"- depth 3: initial page + direct links + links found on those direct link pages\\n\"\n                \"Note: This is about link traversal, not URL path depth.\"\n            ),\n            value=DEFAULT_MAX_DEPTH,\n            range_spec=RangeSpec(min=1, max=5, step=1),\n            required=False,\n            min_label=\" \",\n            max_label=\" \",\n            min_label_icon=\"None\",\n            max_label_icon=\"None\",\n            # slider_input=True\n        ),\n        BoolInput(\n            name=\"prevent_outside\",\n            display_name=\"Prevent Outside\",\n            info=(\n                \"If enabled, only crawls URLs within the same domain as the root URL. \"\n                \"This helps prevent the crawler from going to external websites.\"\n            ),\n            value=True,\n            required=False,\n            advanced=True,\n        ),\n        BoolInput(\n            name=\"use_async\",\n            display_name=\"Use Async\",\n            info=(\n                \"If enabled, uses asynchronous loading which can be significantly faster \"\n                \"but might use more system resources.\"\n            ),\n            value=True,\n            required=False,\n            advanced=True,\n        ),\n        DropdownInput(\n            name=\"format\",\n            display_name=\"Output Format\",\n            info=\"Output Format. Use 'Text' to extract the text from the HTML or 'HTML' for the raw HTML content.\",\n            options=[\"Text\", \"HTML\"],\n            value=DEFAULT_FORMAT,\n            advanced=True,\n        ),\n        IntInput(\n            name=\"timeout\",\n            display_name=\"Timeout\",\n            info=\"Timeout for the request in seconds.\",\n            value=DEFAULT_TIMEOUT,\n            required=False,\n            advanced=True,\n        ),\n        TableInput(\n            name=\"headers\",\n            display_name=\"Headers\",\n            info=\"The headers to send with the request\",\n            table_schema=[\n                {\n                    \"name\": \"key\",\n                    \"display_name\": \"Header\",\n                    \"type\": \"str\",\n                    \"description\": \"Header name\",\n                },\n                {\n                    \"name\": \"value\",\n                    \"display_name\": \"Value\",\n                    \"type\": \"str\",\n                    \"description\": \"Header value\",\n                },\n            ],\n            value=[{\"key\": \"User-Agent\", \"value\": get_settings_service().settings.user_agent}],\n            advanced=True,\n            input_types=[\"DataFrame\"],\n        ),\n        BoolInput(\n            name=\"filter_text_html\",\n            display_name=\"Filter Text/HTML\",\n            info=\"If enabled, filters out text/css content type from the results.\",\n            value=True,\n            required=False,\n            advanced=True,\n        ),\n        BoolInput(\n            name=\"continue_on_failure\",\n            display_name=\"Continue on Failure\",\n            info=\"If enabled, continues crawling even if some requests fail.\",\n            value=True,\n            required=False,\n            advanced=True,\n        ),\n        BoolInput(\n            name=\"check_response_status\",\n            display_name=\"Check Response Status\",\n            info=\"If enabled, checks the response status of the request.\",\n            value=False,\n            required=False,\n            advanced=True,\n        ),\n        BoolInput(\n            name=\"autoset_encoding\",\n            display_name=\"Autoset Encoding\",\n            info=\"If enabled, automatically sets the encoding of the request.\",\n            value=True,\n            required=False,\n            advanced=True,\n        ),\n    ]\n\n    outputs = [\n        Output(display_name=\"Extracted Pages\", name=\"page_results\", method=\"fetch_content\"),\n        Output(display_name=\"Raw Content\", name=\"raw_results\", method=\"fetch_content_as_message\", tool_mode=False),\n    ]\n\n    @staticmethod\n    def validate_url(url: str) -> bool:\n        \"\"\"Validates if the given string matches URL pattern.\n\n        Args:\n            url: The URL string to validate\n\n        Returns:\n            bool: True if the URL is valid, False otherwise\n        \"\"\"\n        return bool(URL_REGEX.match(url))\n\n    def ensure_url(self, url: str) -> str:\n        \"\"\"Ensures the given string is a valid URL.\n\n        Args:\n            url: The URL string to validate and normalize\n\n        Returns:\n            str: The normalized URL\n\n        Raises:\n            ValueError: If the URL is invalid\n        \"\"\"\n        url = url.strip()\n        if not url.startswith((\"http://\", \"https://\")):\n            url = \"https://\" + url\n\n        if not self.validate_url(url):\n            msg = f\"Invalid URL: {url}\"\n            raise ValueError(msg)\n\n        return url\n\n    def _create_loader(self, url: str) -> RecursiveUrlLoader:\n        \"\"\"Creates a RecursiveUrlLoader instance with the configured settings.\n\n        Args:\n            url: The URL to load\n\n        Returns:\n            RecursiveUrlLoader: Configured loader instance\n        \"\"\"\n        headers_dict = {header[\"key\"]: header[\"value\"] for header in self.headers}\n        extractor = (lambda x: x) if self.format == \"HTML\" else (lambda x: BeautifulSoup(x, \"lxml\").get_text())\n\n        return RecursiveUrlLoader(\n            url=url,\n            max_depth=self.max_depth,\n            prevent_outside=self.prevent_outside,\n            use_async=self.use_async,\n            extractor=extractor,\n            timeout=self.timeout,\n            headers=headers_dict,\n            check_response_status=self.check_response_status,\n            continue_on_failure=self.continue_on_failure,\n            base_url=url,  # Add base_url to ensure consistent domain crawling\n            autoset_encoding=self.autoset_encoding,  # Enable automatic encoding detection\n            exclude_dirs=[],  # Allow customization of excluded directories\n            link_regex=None,  # Allow customization of link filtering\n        )\n\n    def fetch_url_contents(self) -> list[dict]:\n        \"\"\"Load documents from the configured URLs.\n\n        Returns:\n            List[Data]: List of Data objects containing the fetched content\n\n        Raises:\n            ValueError: If no valid URLs are provided or if there's an error loading documents\n        \"\"\"\n        try:\n            urls = list({self.ensure_url(url) for url in self.urls if url.strip()})\n            logger.debug(f\"URLs: {urls}\")\n            if not urls:\n                msg = \"No valid URLs provided.\"\n                raise ValueError(msg)\n\n            all_docs = []\n            for url in urls:\n                logger.debug(f\"Loading documents from {url}\")\n\n                try:\n                    loader = self._create_loader(url)\n                    docs = loader.load()\n\n                    if not docs:\n                        logger.warning(f\"No documents found for {url}\")\n                        continue\n\n                    logger.debug(f\"Found {len(docs)} documents from {url}\")\n                    all_docs.extend(docs)\n\n                except requests.exceptions.RequestException as e:\n                    logger.exception(f\"Error loading documents from {url}: {e}\")\n                    continue\n\n            if not all_docs:\n                msg = \"No documents were successfully loaded from any URL\"\n                raise ValueError(msg)\n\n            # data = [Data(text=doc.page_content, **doc.metadata) for doc in all_docs]\n            data = [\n                {\n                    \"text\": safe_convert(doc.page_content, clean_data=True),\n                    \"url\": doc.metadata.get(\"source\", \"\"),\n                    \"title\": doc.metadata.get(\"title\", \"\"),\n                    \"description\": doc.metadata.get(\"description\", \"\"),\n                    \"content_type\": doc.metadata.get(\"content_type\", \"\"),\n                    \"language\": doc.metadata.get(\"language\", \"\"),\n                }\n                for doc in all_docs\n            ]\n        except Exception as e:\n            error_msg = e.message if hasattr(e, \"message\") else e\n            msg = f\"Error loading documents: {error_msg!s}\"\n            logger.exception(msg)\n            raise ValueError(msg) from e\n        return data\n\n    def fetch_content(self) -> DataFrame:\n        \"\"\"Convert the documents to a DataFrame.\"\"\"\n        return DataFrame(data=self.fetch_url_contents())\n\n    def fetch_content_as_message(self) -> Message:\n        \"\"\"Convert the documents to a Message.\"\"\"\n        url_contents = self.fetch_url_contents()\n        return Message(text=\"\\n\\n\".join([x[\"text\"] for x in url_contents]), data={\"data\": url_contents})\n"
              },
              "continue_on_failure": {
                "_input_type": "BoolInput",
                "advanced": true,
                "display_name": "Continue on Failure",
                "dynamic": false,
                "info": "If enabled, continues crawling even if some requests fail.",
                "list": false,
                "list_add_label": "Add More",
                "name": "continue_on_failure",
                "placeholder": "",
                "required": false,
                "show": true,
                "title_case": false,
                "tool_mode": false,
                "trace_as_metadata": true,
                "type": "bool",
                "value": true
              },
              "filter_text_html": {
                "_input_type": "BoolInput",
                "advanced": true,
                "display_name": "Filter Text/HTML",
                "dynamic": false,
                "info": "If enabled, filters out text/css content type from the results.",
                "list": false,
                "list_add_label": "Add More",
                "name": "filter_text_html",
                "placeholder": "",
                "required": false,
                "show": true,
                "title_case": false,
                "tool_mode": false,
                "trace_as_metadata": true,
                "type": "bool",
                "value": true
              },
              "format": {
                "_input_type": "DropdownInput",
                "advanced": true,
                "combobox": false,
                "dialog_inputs": {},
                "display_name": "Output Format",
                "dynamic": false,
                "external_options": {},
                "info": "Output Format. Use 'Text' to extract the text from the HTML or 'HTML' for the raw HTML content.",
                "name": "format",
                "options": [
                  "Text",
                  "HTML"
                ],
                "options_metadata": [],
                "placeholder": "",
                "required": false,
                "show": true,
                "title_case": false,
                "toggle": false,
                "tool_mode": false,
                "trace_as_metadata": true,
                "type": "str",
                "value": "Text"
              },
              "headers": {
                "_input_type": "TableInput",
                "advanced": true,
                "display_name": "Headers",
                "dynamic": false,
                "info": "The headers to send with the request",
                "input_types": [
                  "DataFrame"
                ],
                "is_list": true,
                "list_add_label": "Add More",
                "name": "headers",
                "placeholder": "",
                "required": false,
                "show": true,
                "table_icon": "Table",
                "table_schema": {
                  "columns": [
                    {
                      "default": "None",
                      "description": "Header name",
                      "disable_edit": false,
                      "display_name": "Header",
                      "edit_mode": "popover",
                      "filterable": true,
                      "formatter": "text",
                      "hidden": false,
                      "name": "key",
                      "sortable": true,
                      "type": "str"
                    },
                    {
                      "default": "None",
                      "description": "Header value",
                      "disable_edit": false,
                      "display_name": "Value",
                      "edit_mode": "popover",
                      "filterable": true,
                      "formatter": "text",
                      "hidden": false,
                      "name": "value",
                      "sortable": true,
                      "type": "str"
                    }
                  ]
                },
                "title_case": false,
                "tool_mode": false,
                "trace_as_metadata": true,
                "trigger_icon": "Table",
                "trigger_text": "Open table",
                "type": "table",
                "value": [
                  {
                    "key": "User-Agent",
                    "value": "langflow"
                  }
                ]
              },
              "max_depth": {
                "_input_type": "SliderInput",
                "advanced": false,
                "display_name": "Depth",
                "dynamic": false,
                "info": "Controls how many 'clicks' away from the initial page the crawler will go:\n- depth 1: only the initial page\n- depth 2: initial page + all pages linked directly from it\n- depth 3: initial page + direct links + links found on those direct link pages\nNote: This is about link traversal, not URL path depth.",
                "max_label": " ",
                "max_label_icon": "None",
                "min_label": " ",
                "min_label_icon": "None",
                "name": "max_depth",
                "placeholder": "",
                "range_spec": {
                  "max": 5,
                  "min": 1,
                  "step": 1,
                  "step_type": "float"
                },
                "required": false,
                "show": true,
                "slider_buttons": false,
                "slider_buttons_options": [],
                "slider_input": false,
                "title_case": false,
                "tool_mode": false,
                "type": "slider",
                "value": 1
              },
              "prevent_outside": {
                "_input_type": "BoolInput",
                "advanced": true,
                "display_name": "Prevent Outside",
                "dynamic": false,
                "info": "If enabled, only crawls URLs within the same domain as the root URL. This helps prevent the crawler from going to external websites.",
                "list": false,
                "list_add_label": "Add More",
                "name": "prevent_outside",
                "placeholder": "",
                "required": false,
                "show": true,
                "title_case": false,
                "tool_mode": false,
                "trace_as_metadata": true,
                "type": "bool",
                "value": true
              },
              "timeout": {
                "_input_type": "IntInput",
                "advanced": true,
                "display_name": "Timeout",
                "dynamic": false,
                "info": "Timeout for the request in seconds.",
                "list": false,
                "list_add_label": "Add More",
                "name": "timeout",
                "placeholder": "",
                "required": false,
                "show": true,
                "title_case": false,
                "tool_mode": false,
                "trace_as_metadata": true,
                "type": "int",
                "value": 30
              },
              "urls": {
                "_input_type": "MessageTextInput",
                "advanced": false,
                "display_name": "URLs",
                "dynamic": false,
                "info": "Enter one or more URLs to crawl recursively, by clicking the '+' button.",
                "input_types": [],
                "list": true,
                "list_add_label": "Add URL",
                "load_from_db": false,
                "name": "urls",
                "placeholder": "Enter a URL...",
                "required": false,
                "show": true,
                "title_case": false,
                "tool_mode": true,
                "trace_as_input": true,
                "trace_as_metadata": true,
                "type": "str",
                "value": [
                  "www.langflow.org"
                ]
              },
              "use_async": {
                "_input_type": "BoolInput",
                "advanced": true,
                "display_name": "Use Async",
                "dynamic": false,
                "info": "If enabled, uses asynchronous loading which can be significantly faster but might use more system resources.",
                "list": false,
                "list_add_label": "Add More",
                "name": "use_async",
                "placeholder": "",
                "required": false,
                "show": true,
                "title_case": false,
                "tool_mode": false,
                "trace_as_metadata": true,
                "type": "bool",
                "value": true
              }
            },
            "tool_mode": false
          },
          "selected_output": "raw_results",
          "showNode": true,
          "type": "URLComponent"
        },
        "dragging": false,
        "id": "URLComponent-ZszYV",
        "measured": {
          "height": 291,
          "width": 320
        },
        "position": {
          "x": 201,
          "y": 136.0028419494629
        },
        "selected": true,
        "type": "genericNode"
      },
      {
        "data": {
          "id": "SplitText-HYi2s",
          "node": {
            "base_classes": [
              "DataFrame"
            ],
            "beta": false,
            "conditional_paths": [],
            "custom_fields": {},
            "description": "Split text into chunks based on specified criteria.",
            "display_name": "Split Text",
            "documentation": "https://docs.langflow.org/components-processing#split-text",
            "edited": false,
            "field_order": [
              "data_inputs",
              "chunk_overlap",
              "chunk_size",
              "separator",
              "text_key",
              "keep_separator"
            ],
            "frozen": false,
            "icon": "scissors-line-dashed",
            "legacy": false,
            "metadata": {
              "code_hash": "dbf2e9d2319d",
              "dependencies": {
                "dependencies": [
                  {
                    "name": "langchain_text_splitters",
                    "version": "0.3.9"
                  },
                  {
                    "name": "langflow",
                    "version": null
                  }
                ],
                "total_dependencies": 2
              },
              "module": "langflow.components.processing.split_text.SplitTextComponent"
            },
            "minimized": false,
            "output_types": [],
            "outputs": [
              {
                "allows_loop": false,
                "cache": true,
                "display_name": "Chunks",
                "group_outputs": false,
                "method": "split_text",
                "name": "dataframe",
                "selected": "DataFrame",
                "tool_mode": true,
                "types": [
                  "DataFrame"
                ],
                "value": "__UNDEFINED__"
              }
            ],
            "pinned": false,
            "template": {
              "_type": "Component",
              "chunk_overlap": {
                "_input_type": "IntInput",
                "advanced": false,
                "display_name": "Chunk Overlap",
                "dynamic": false,
                "info": "Number of characters to overlap between chunks.",
                "list": false,
                "list_add_label": "Add More",
                "name": "chunk_overlap",
                "placeholder": "",
                "required": false,
                "show": true,
                "title_case": false,
                "tool_mode": false,
                "trace_as_metadata": true,
                "type": "int",
                "value": 200
              },
              "chunk_size": {
                "_input_type": "IntInput",
                "advanced": false,
                "display_name": "Chunk Size",
                "dynamic": false,
                "info": "The maximum length of each chunk. Text is first split by separator, then chunks are merged up to this size. Individual splits larger than this won't be further divided.",
                "list": false,
                "list_add_label": "Add More",
                "name": "chunk_size",
                "placeholder": "",
                "required": false,
                "show": true,
                "title_case": false,
                "tool_mode": false,
                "trace_as_metadata": true,
                "type": "int",
                "value": 1000
              },
              "code": {
                "advanced": true,
                "dynamic": true,
                "fileTypes": [],
                "file_path": "",
                "info": "",
                "list": false,
                "load_from_db": false,
                "multiline": true,
                "name": "code",
                "password": false,
                "placeholder": "",
                "required": true,
                "show": true,
                "title_case": false,
                "type": "code",
                "value": "from langchain_text_splitters import CharacterTextSplitter\n\nfrom langflow.custom.custom_component.component import Component\nfrom langflow.io import DropdownInput, HandleInput, IntInput, MessageTextInput, Output\nfrom langflow.schema.data import Data\nfrom langflow.schema.dataframe import DataFrame\nfrom langflow.schema.message import Message\nfrom langflow.utils.util import unescape_string\n\n\nclass SplitTextComponent(Component):\n    display_name: str = \"Split Text\"\n    description: str = \"Split text into chunks based on specified criteria.\"\n    documentation: str = \"https://docs.langflow.org/components-processing#split-text\"\n    icon = \"scissors-line-dashed\"\n    name = \"SplitText\"\n\n    inputs = [\n        HandleInput(\n            name=\"data_inputs\",\n            display_name=\"Input\",\n            info=\"The data with texts to split in chunks.\",\n            input_types=[\"Data\", \"DataFrame\", \"Message\"],\n            required=True,\n        ),\n        IntInput(\n            name=\"chunk_overlap\",\n            display_name=\"Chunk Overlap\",\n            info=\"Number of characters to overlap between chunks.\",\n            value=200,\n        ),\n        IntInput(\n            name=\"chunk_size\",\n            display_name=\"Chunk Size\",\n            info=(\n                \"The maximum length of each chunk. Text is first split by separator, \"\n                \"then chunks are merged up to this size. \"\n                \"Individual splits larger than this won't be further divided.\"\n            ),\n            value=1000,\n        ),\n        MessageTextInput(\n            name=\"separator\",\n            display_name=\"Separator\",\n            info=(\n                \"The character to split on. Use \\\\n for newline. \"\n                \"Examples: \\\\n\\\\n for paragraphs, \\\\n for lines, . for sentences\"\n            ),\n            value=\"\\n\",\n        ),\n        MessageTextInput(\n            name=\"text_key\",\n            display_name=\"Text Key\",\n            info=\"The key to use for the text column.\",\n            value=\"text\",\n            advanced=True,\n        ),\n        DropdownInput(\n            name=\"keep_separator\",\n            display_name=\"Keep Separator\",\n            info=\"Whether to keep the separator in the output chunks and where to place it.\",\n            options=[\"False\", \"True\", \"Start\", \"End\"],\n            value=\"False\",\n            advanced=True,\n        ),\n    ]\n\n    outputs = [\n        Output(display_name=\"Chunks\", name=\"dataframe\", method=\"split_text\"),\n    ]\n\n    def _docs_to_data(self, docs) -> list[Data]:\n        return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]\n\n    def _fix_separator(self, separator: str) -> str:\n        \"\"\"Fix common separator issues and convert to proper format.\"\"\"\n        if separator == \"/n\":\n            return \"\\n\"\n        if separator == \"/t\":\n            return \"\\t\"\n        return separator\n\n    def split_text_base(self):\n        separator = self._fix_separator(self.separator)\n        separator = unescape_string(separator)\n\n        if isinstance(self.data_inputs, DataFrame):\n            if not len(self.data_inputs):\n                msg = \"DataFrame is empty\"\n                raise TypeError(msg)\n\n            self.data_inputs.text_key = self.text_key\n            try:\n                documents = self.data_inputs.to_lc_documents()\n            except Exception as e:\n                msg = f\"Error converting DataFrame to documents: {e}\"\n                raise TypeError(msg) from e\n        elif isinstance(self.data_inputs, Message):\n            self.data_inputs = [self.data_inputs.to_data()]\n            return self.split_text_base()\n        else:\n            if not self.data_inputs:\n                msg = \"No data inputs provided\"\n                raise TypeError(msg)\n\n            documents = []\n            if isinstance(self.data_inputs, Data):\n                self.data_inputs.text_key = self.text_key\n                documents = [self.data_inputs.to_lc_document()]\n            else:\n                try:\n                    documents = [input_.to_lc_document() for input_ in self.data_inputs if isinstance(input_, Data)]\n                    if not documents:\n                        msg = f\"No valid Data inputs found in {type(self.data_inputs)}\"\n                        raise TypeError(msg)\n                except AttributeError as e:\n                    msg = f\"Invalid input type in collection: {e}\"\n                    raise TypeError(msg) from e\n        try:\n            # Convert string 'False'/'True' to boolean\n            keep_sep = self.keep_separator\n            if isinstance(keep_sep, str):\n                if keep_sep.lower() == \"false\":\n                    keep_sep = False\n                elif keep_sep.lower() == \"true\":\n                    keep_sep = True\n                # 'start' and 'end' are kept as strings\n\n            splitter = CharacterTextSplitter(\n                chunk_overlap=self.chunk_overlap,\n                chunk_size=self.chunk_size,\n                separator=separator,\n                keep_separator=keep_sep,\n            )\n            return splitter.split_documents(documents)\n        except Exception as e:\n            msg = f\"Error splitting text: {e}\"\n            raise TypeError(msg) from e\n\n    def split_text(self) -> DataFrame:\n        return DataFrame(self._docs_to_data(self.split_text_base()))\n"
              },
              "data_inputs": {
                "_input_type": "HandleInput",
                "advanced": false,
                "display_name": "Input",
                "dynamic": false,
                "info": "The data with texts to split in chunks.",
                "input_types": [
                  "Data",
                  "DataFrame",
                  "Message"
                ],
                "list": false,
                "list_add_label": "Add More",
                "name": "data_inputs",
                "placeholder": "",
                "required": true,
                "show": true,
                "title_case": false,
                "trace_as_metadata": true,
                "type": "other",
                "value": ""
              },
              "keep_separator": {
                "_input_type": "DropdownInput",
                "advanced": true,
                "combobox": false,
                "dialog_inputs": {},
                "display_name": "Keep Separator",
                "dynamic": false,
                "external_options": {},
                "info": "Whether to keep the separator in the output chunks and where to place it.",
                "name": "keep_separator",
                "options": [
                  "False",
                  "True",
                  "Start",
                  "End"
                ],
                "options_metadata": [],
                "placeholder": "",
                "required": false,
                "show": true,
                "title_case": false,
                "toggle": false,
                "tool_mode": false,
                "trace_as_metadata": true,
                "type": "str",
                "value": "False"
              },
              "separator": {
                "_input_type": "MessageTextInput",
                "advanced": false,
                "display_name": "Separator",
                "dynamic": false,
                "info": "The character to split on. Use \\n for newline. Examples: \\n\\n for paragraphs, \\n for lines, . for sentences",
                "input_types": [
                  "Message"
                ],
                "list": false,
                "list_add_label": "Add More",
                "load_from_db": false,
                "name": "separator",
                "placeholder": "",
                "required": false,
                "show": true,
                "title_case": false,
                "tool_mode": false,
                "trace_as_input": true,
                "trace_as_metadata": true,
                "type": "str",
                "value": ""
              },
              "text_key": {
                "_input_type": "MessageTextInput",
                "advanced": true,
                "display_name": "Text Key",
                "dynamic": false,
                "info": "The key to use for the text column.",
                "input_types": [
                  "Message"
                ],
                "list": false,
                "list_add_label": "Add More",
                "load_from_db": false,
                "name": "text_key",
                "placeholder": "",
                "required": false,
                "show": true,
                "title_case": false,
                "tool_mode": false,
                "trace_as_input": true,
                "trace_as_metadata": true,
                "type": "str",
                "value": "text"
              }
            },
            "tool_mode": false
          },
          "showNode": true,
          "type": "SplitText"
        },
        "dragging": false,
        "id": "SplitText-HYi2s",
        "measured": {
          "height": 412,
          "width": 320
        },
        "position": {
          "x": 647,
          "y": 80.00284194946289
        },
        "selected": false,
        "type": "genericNode"
      }
    ],
    "viewport": {
      "x": -16.92843911490928,
      "y": 52.166290984503576,
      "zoom": 0.85748409862485
    }
  },
  "description": "Building Intelligent Interactions.",
  "endpoint_name": null,
  "id": "86a0c794-7c71-4b21-84d6-9ba055b195b1",
  "is_component": false,
  "last_tested_version": "1.6.0",
  "name": "New Flow (1)",
  "tags": []
}