diff --git a/.auxiliary/notes/issues.md b/.auxiliary/notes/issues.md index 1e36a7e..c07ba43 100644 --- a/.auxiliary/notes/issues.md +++ b/.auxiliary/notes/issues.md @@ -1,3 +1,89 @@ # Librovore Issues and Enhancement Opportunities -No open issues at this time. +## SSL/TLS Certificate Verification Failure + +**Date Reported**: 2025-11-19 +**Component**: Sphinx inventory processor (urllib-based inventory download) +**Severity**: Medium (blocks testing with some sites) + +### Issue Description + +When attempting to fetch Sphinx object inventories from certain sites (e.g., `docs.twistedmatrix.com`, `www.dulwich.io`), the inventory processor fails with: + +``` + +``` + +### Observed Behavior + +- ✅ **Detection/probing via httpx**: Successfully connects to sites (HEAD/GET for HTML) +- ❌ **Inventory download via urllib**: Fails SSL verification + +### Root Cause + +The certificate chains for these documentation sites include self-signed certificates. Different SSL handling between: +- **httpx** (used for detection): More lenient or different SSL context +- **urllib** (used in Sphinx inventory processor): Strict SSL verification against system CA bundle + +### Impact + +- **Structure processors** (including new Pydoctor processor) cannot be fully tested end-to-end with these sites +- **Inventory processor** cannot fetch inventory files from affected sites +- Does not affect sites with properly signed certificates + +### Affected Sites + +- https://docs.twistedmatrix.com/en/stable/api/ +- https://www.dulwich.io/api/ + +### Potential Solutions + +1. **Configure httpx-based inventory fetching** to use same client as detection +2. **Add SSL verification configuration** to allow disabling verification for specific domains (testing only) +3. **Report to site maintainers** about certificate chain issues +4. **Use different inventory sources** (manual creation, alternative processors) + +### Notes + +This issue was discovered during Pydoctor structure processor testing. 
The structure processor implementation is correct and works properly when inventory objects are available from other sources. + +--- + +## Code Duplication: normalize_base_url + +**Date Reported**: 2025-11-19 +**Component**: Structure processors (Sphinx, Pydoctor) +**Severity**: Low (technical debt) + +### Issue Description + +The `normalize_base_url` function is duplicated across structure processor packages: +- `sources/librovore/structures/sphinx/urls.py` +- `sources/librovore/structures/pydoctor/urls.py` + +### Current State + +Both implementations are identical and handle: +- URL parsing and normalization +- File path to URL conversion +- Scheme validation (http, https, file) +- Path cleanup (trailing slash removal) + +### Recommendation + +Extract `normalize_base_url` and related URL utilities to a shared location: +- Option 1: `sources/librovore/structures/urls.py` (common module) +- Option 2: `sources/librovore/urls.py` (top-level utility) +- Option 3: Include in base structure processor class + +### Benefits + +- Reduces code duplication +- Ensures consistent URL handling across all structure processors +- Simplifies maintenance and testing +- Reduces risk of divergence between implementations + +### Impact + +Low priority - current duplication is manageable with only two instances. Should be addressed before adding more structure processors to prevent further duplication. diff --git a/data/configuration/general.toml b/data/configuration/general.toml index 17c187e..1b1a89f 100644 --- a/data/configuration/general.toml +++ b/data/configuration/general.toml @@ -32,6 +32,10 @@ enabled = true name = "mkdocs" enabled = true +[[structure-extensions]] +name = "pydoctor" +enabled = true + # External Extension Examples # Uncomment and modify these examples to add external documentation processors. 
diff --git a/sources/librovore/structures/pydoctor/__.py b/sources/librovore/structures/pydoctor/__.py new file mode 100644 index 0000000..b69b3f4 --- /dev/null +++ b/sources/librovore/structures/pydoctor/__.py @@ -0,0 +1,26 @@ +# vim: set filetype=python fileencoding=utf-8: +# -*- coding: utf-8 -*- + +#============================================================================# +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +# # +#============================================================================# + + +''' Pydoctor subpackage import namespace. ''' + +# ruff: noqa: F403 + + +from ..__ import * diff --git a/sources/librovore/structures/pydoctor/__init__.py b/sources/librovore/structures/pydoctor/__init__.py new file mode 100644 index 0000000..0a8dde3 --- /dev/null +++ b/sources/librovore/structures/pydoctor/__init__.py @@ -0,0 +1,33 @@ +# vim: set filetype=python fileencoding=utf-8: +# -*- coding: utf-8 -*- + +#============================================================================# +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. 
# +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +# # +#============================================================================# + + +''' Pydoctor documentation source detector and processor. ''' + + +from .detection import PydoctorDetection +from .main import PydoctorProcessor + +from . import __ + + +def register( arguments: __.cabc.Mapping[ str, __.typx.Any ] ) -> None: + ''' Registers configured Pydoctor processor instance. ''' + processor = PydoctorProcessor( ) + __.structure_processors[ processor.name ] = processor diff --git a/sources/librovore/structures/pydoctor/conversion.py b/sources/librovore/structures/pydoctor/conversion.py new file mode 100644 index 0000000..788a314 --- /dev/null +++ b/sources/librovore/structures/pydoctor/conversion.py @@ -0,0 +1,84 @@ +# vim: set filetype=python fileencoding=utf-8: +# -*- coding: utf-8 -*- + +#============================================================================# +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +# # +#============================================================================# + + +''' HTML to markdown conversion utilities. ''' + + +from bs4 import BeautifulSoup as _BeautifulSoup + +from . import __ + + +class PydoctorMarkdownConverter( __.markdownify.MarkdownConverter ): + ''' Custom markdownify converter for Pydoctor HTML. ''' + + def convert_pre( + self, + el: __.typx.Any, + text: str, + convert_as_inline: bool, + ) -> str: + ''' Converts pre elements with Python code detection. ''' + if self.is_code_block( el ): + # Pydoctor code blocks are typically Python + code_text = el.get_text( ) + return f"\n```python\n{code_text}\n```\n" + return super( ).convert_pre( el, text, convert_as_inline ) + + def is_code_block( self, element: __.typx.Any ) -> bool: + ''' Determines if element is a code block. ''' + # Pydoctor uses
+        # <pre> for code blocks
+        return element.name == 'pre'
+
+
+def html_to_markdown( html_text: str ) -> str:
+    ''' Renders Pydoctor HTML as markdown; empty string for blank input. '''
+    if html_text.strip( ) == '': return ''
+    # Best effort: any failure below yields the input HTML unchanged.
+    # Converter options: ATX headings, stripped tags, no _ or * escaping.
+    options: dict[ str, __.typx.Any ] = dict(
+        heading_style = 'ATX',
+        strip = [ 'nav', 'header', 'footer', 'script' ],
+        escape_underscores = False,
+        escape_asterisks = False )
+    try:
+        prepared = _preprocess_pydoctor_html( html_text )
+        rendered = PydoctorMarkdownConverter( **options ).convert( prepared )
+    except Exception: return html_text
+    else: return rendered.strip( )
+
+
+def _preprocess_pydoctor_html( html_text: str ) -> str:
+    ''' Strips page chrome from Pydoctor HTML; returns serialized tree. '''
+    soup: __.typx.Any = _BeautifulSoup( html_text, 'lxml' )
+    # Remove navigation elements
+    for selector in [ '.navbar', '.sidebar', '.mainnavbar' ]:
+        for element in soup.select( selector ):
+            element.decompose( )
+    # Remove search elements
+    for selector in [ '#searchBox', '.search' ]:
+        for element in soup.select( selector ):
+            element.decompose( )
+    # Unwrap (keep children of) Bootstrap scaffolding. CSS has no class-name
+    # wildcard: '.col-md-*' is a selector syntax error, so match by substring.
+    for selector in [ '.container', '.row', '[class*="col-md-"]' ]:
+        for element in soup.select( selector ):
+            element.unwrap( )
+    return str( soup )
diff --git a/sources/librovore/structures/pydoctor/detection.py b/sources/librovore/structures/pydoctor/detection.py
new file mode 100644
index 0000000..90a1961
--- /dev/null
+++ b/sources/librovore/structures/pydoctor/detection.py
@@ -0,0 +1,105 @@
+# vim: set filetype=python fileencoding=utf-8:
+# -*- coding: utf-8 -*-
+
+#============================================================================#
+#                                                                            #
+#  Licensed under the Apache License, Version 2.0 (the "License");           #
+#  you may not use this file except in compliance with the License.          #
+#  You may obtain a copy of the License at                                   #
+#                                                                            #
+#      http://www.apache.org/licenses/LICENSE-2.0                            #
+#                                                                            #
+#  Unless required by applicable law or agreed to in writing, software       #
+#  distributed under the License is distributed on an "AS IS" BASIS,         #
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+#  See the License for the specific language governing permissions and       #
+#  limitations under the License.                                            #
+#                                                                            #
+#============================================================================#
+
+
+''' Pydoctor detection and metadata extraction. '''
+
+
+from urllib.parse import ParseResult as _Url
+
+from . import __
+from . import extraction as _extraction
+from . import urls as _urls
+
+
+_scribe = __.acquire_scribe( __name__ )
+
+
+class PydoctorDetection( __.StructureDetection ):
+    ''' Structure detection outcome for a Pydoctor documentation site. '''
+
+    source: str
+    normalized_source: str = ''
+
+    @classmethod
+    def get_capabilities( cls ) -> __.StructureProcessorCapabilities:
+        ''' Describes inventory types and features processor supports. '''
+        features = __.ContentExtractionFeatures
+        extractables = frozenset( (
+            features.Signatures,
+            features.Descriptions,
+            features.CodeExamples,
+        ) )
+        confidences = __.immut.Dictionary( { 'pydoctor': 1.0 } )
+        return __.StructureProcessorCapabilities(
+            supported_inventory_types = frozenset( ( 'pydoctor', ) ),
+            content_extraction_features = extractables,
+            confidence_by_inventory_type = confidences )
+
+    @classmethod
+    async def from_source(
+        selfclass,
+        auxdata: __.ApplicationGlobals,
+        processor: __.Processor,
+        source: str,
+    ) -> __.typx.Self:
+        ''' Delegates detection of source to processor; casts result. '''
+        return __.typx.cast(
+            __.typx.Self, await processor.detect( auxdata, source ) )
+
+    async def extract_contents(
+        self,
+        auxdata: __.ApplicationGlobals,
+        source: str,
+        objects: __.cabc.Sequence[ __.InventoryObject ], /,
+    ) -> tuple[ __.ContentDocument, ... ]:
+        ''' Extracts content documents for objects from given source. '''
+        extracts = await _extraction.extract_contents(
+            auxdata, source, objects )
+        return tuple( extracts )
+
+
+async def detect_pydoctor(
+    auxdata: __.ApplicationGlobals, base_url: _Url
+) -> float:
+    ''' Detects if source is a Pydoctor documentation site. '''
+    confidence = 0.0
+    # Check for index.html
+    index_url = _urls.derive_index_url( base_url )
+    try:
+        html_content = await __.retrieve_url_as_text(
+            auxdata.content_cache,
+            index_url, duration_max = 10.0 )
+    except Exception as exc:
+        _scribe.debug( f"Detection failed for {base_url.geturl( )}: {exc}" )
+        return confidence
+    html_lower = html_content.lower( )
+    # Check for pydoctor meta tag (highest confidence)
+    if ' list[ __.ContentDocument ]:
+    ''' Extracts documentation content for specified objects. '''
+    if not objects: return [ ]
+    tasks = [
+        _extract_object_documentation( auxdata, source, obj )
+        for obj in objects ]
+    candidate_results = await __.asyncf.gather_async(
+        *tasks, return_exceptions = True )
+    results: list[ __.ContentDocument ] = [
+        result.value for result in candidate_results
+        if __.generics.is_value( result ) and result.value is not None ]
+    return results
+
+
+def parse_pydoctor_html(
+    content: str, qname: str
+) -> __.cabc.Mapping[ str, str ]:
+    ''' Parses Pydoctor HTML; returns HTML description and object name. '''
+    from html import escape as _escape  # stdlib; scoped to this function
+    try: soup = _BeautifulSoup( content, 'lxml' )
+    except Exception as exc:
+        raise __.DocumentationParseFailure( qname, exc ) from exc
+    signature = _extract_signature( soup, qname )
+    docstring = _extract_docstring( soup )
+    description_parts: list[ str ] = [ ]
+    # Description is later fed to an HTML-to-markdown converter; a literal
+    # markdown fence would be treated as flowing text, so use <pre> instead.
+    if signature:
+        description_parts.append( f"<pre>{_escape( signature )}</pre>" )
+    if docstring: description_parts.append( docstring )
+    return {
+        'description': '\n\n'.join( description_parts ),
+        'object_name': qname,
+    }
+
+
+async def _extract_object_documentation(
+    auxdata: __.ApplicationGlobals,
+    location: str,
+    obj: __.InventoryObject,
+) -> __.ContentDocument | None:
+    ''' Extracts one object's document; None if fetch or parse fails. '''
+    base_url = _urls.normalize_base_url( location )
+    doc_url = _urls.derive_documentation_url( base_url, obj.uri )
+    doc_location = doc_url.geturl( )  # URL text, not ParseResult repr
+    try:
+        html_content = await __.retrieve_url_as_text(
+            auxdata.content_cache, doc_url )
+    except Exception as exc:
+        _scribe.debug( "Failed to retrieve %s: %s", doc_location, exc )
+        return None
+    try:
+        parsed_content = parse_pydoctor_html( html_content, obj.name )
+    except Exception as exc:
+        _scribe.debug( "Failed to parse %s: %s", obj.name, exc )
+        return None
+    description = _conversion.html_to_markdown(
+        parsed_content[ 'description' ] )
+    return __.ContentDocument(
+        inventory_object = obj,
+        content_id = __.produce_content_id( location, obj.name ),
+        description = description,
+        documentation_url = doc_location )
+
+
+def _extract_docstring( soup: __.typx.Any ) -> str:
+    ''' Serializes first ``div.docstring`` as HTML, minus any ``nav``. '''
+    container = soup.find( 'div', class_ = 'docstring' )
+    if container:
+        # Navigation markup is page chrome rather than documentation.
+        for navigation in container.find_all( 'nav' ): navigation.decompose( )
+        return str( container )
+    return ''
+
+
+def _extract_signature( soup: __.typx.Any, qname: str ) -> str:
+    ''' Finds first nonempty signature text; falls back to ``qname``. '''
+    # NOTE(review): ``get_text( strip = True )`` strips each text node and
+    #               joins them without a separator -- confirm signatures
+    #               split across several nodes keep their spacing.
+    def _candidates( ) -> __.cabc.Iterator[ __.typx.Any ]:
+        ''' Yields candidate elements lazily, most specific first. '''
+        # 1. Object name within title (module/class name).
+        yield soup.find( 'code', class_ = 'thisobject' )
+        # 2. First code element within function header.
+        header = soup.find( 'div', class_ = 'functionHeader' )
+        if header:
+            yield header.find( 'code' )
+        # 3. First code element within any element of class 'thingTitle'.
+        title = soup.find( class_ = 'thingTitle' )
+        if title:
+            yield title.find( 'code' )
+    # First candidate with nonempty text wins.
+    for candidate in _candidates( ):
+        # Absent elements and elements without text are passed over.
+        if not candidate:
+            continue
+        found = candidate.get_text( strip = True )
+        if found:
+            return found
+    # 4. Nothing usable in the page: the qualified name stands in.
+    return qname
diff --git a/sources/librovore/structures/pydoctor/main.py b/sources/librovore/structures/pydoctor/main.py
new file mode 100644
index 0000000..fefe58c
--- /dev/null
+++ b/sources/librovore/structures/pydoctor/main.py
@@ -0,0 +1,68 @@
+# vim: set filetype=python fileencoding=utf-8:
+# -*- coding: utf-8 -*-
+
+#============================================================================#
+#                                                                            #
+#  Licensed under the Apache License, Version 2.0 (the "License");           #
+#  you may not use this file except in compliance with the License.          #
+#  You may obtain a copy of the License at                                   #
+#                                                                            #
+#      http://www.apache.org/licenses/LICENSE-2.0                            #
+#                                                                            #
+#  Unless required by applicable law or agreed to in writing, software       #
+#  distributed under the License is distributed on an "AS IS" BASIS,         #
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+#  See the License for the specific language governing permissions and       #
+#  limitations under the License.                                            #
+#                                                                            #
+#============================================================================#
+
+
+''' Main Pydoctor processor implementation. '''
+
+
+from . import __
+from . import detection as _detection
+from . import urls as _urls
+
+
+_scribe = __.acquire_scribe( __name__ )
+
+
+class PydoctorProcessor( __.Processor ):
+    ''' Structure processor for sites generated by Pydoctor. '''
+
+    name: str = 'pydoctor'
+
+    @property
+    def capabilities( self ) -> __.ProcessorCapabilities:
+        ''' Advertises static capabilities of this processor. '''
+        description = (
+            'Works with Pydoctor-generated '
+            'Python API documentation sites' )
+        return __.ProcessorCapabilities(
+            processor_name = 'pydoctor',
+            version = '1.0.0',
+            supported_filters = [ ],
+            results_limit_max = 100,
+            response_time_typical = 'fast',
+            notes = description )
+
+    async def detect(
+        self, auxdata: __.ApplicationGlobals, source: str
+    ) -> __.StructureDetection:
+        ''' Scores source as Pydoctor site; zero if source is invalid. '''
+        detection_class = _detection.PydoctorDetection
+        try: base_url = _urls.normalize_base_url( source )
+        except Exception:
+            # Source which cannot be normalized cannot be probed.
+            return detection_class(
+                processor = self, confidence = 0.0, source = source )
+        # Confidence comes from probing the normalized location.
+        score = await _detection.detect_pydoctor(
+            auxdata, base_url )
+        return detection_class(
+            processor = self,
+            confidence = score,
+            source = source,
+            normalized_source = base_url.geturl( ) )
diff --git a/sources/librovore/structures/pydoctor/urls.py b/sources/librovore/structures/pydoctor/urls.py
new file mode 100644
index 0000000..5683310
--- /dev/null
+++ b/sources/librovore/structures/pydoctor/urls.py
@@ -0,0 +1,62 @@
+# vim: set filetype=python fileencoding=utf-8:
+# -*- coding: utf-8 -*-
+
+#============================================================================#
+#                                                                            #
+#  Licensed under the Apache License, Version 2.0 (the "License");           #
+#  you may not use this file except in compliance with the License.          #
+#  You may obtain a copy of the License at                                   #
+#                                                                            #
+#      http://www.apache.org/licenses/LICENSE-2.0                            #
+#                                                                            #
+#  Unless required by applicable law or agreed to in writing, software       #
+#  distributed under the License is distributed on an "AS IS" BASIS,         #
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+#  See the License for the specific language governing permissions and       #
+#  limitations under the License.                                            #
+#                                                                            #
+#============================================================================#
+
+
+''' URL manipulation and normalization functions. '''
+
+
+import urllib.parse as _urlparse
+
+from urllib.parse import ParseResult as _Url
+
+from . import __
+
+
+def normalize_base_url( source: str ) -> _Url:
+    ''' Reduces URL or filesystem path to clean base documentation URL. '''
+    try: parsed = _urlparse.urlparse( source )
+    except Exception as exc:
+        raise __.InventoryUrlInvalidity( source ) from exc
+    if parsed.scheme == '':
+        # No scheme: filesystem path. File-like paths yield their parent.
+        fspath = __.Path( source )
+        if fspath.is_file( ) or ( fspath.suffix and not fspath.exists( ) ):
+            fspath = fspath.parent
+        parsed = _urlparse.urlparse( fspath.resolve( ).as_uri( ) )
+    elif parsed.scheme not in ( 'http', 'https', 'file' ):
+        raise __.InventoryUrlInvalidity( source )
+    return _urlparse.ParseResult(
+        scheme = parsed.scheme, netloc = parsed.netloc,
+        path = parsed.path.rstrip( '/' ),
+        params = '', query = '', fragment = '' )
+
+
+def derive_documentation_url(
+    base_url: _Url, object_uri: str
+) -> _Url:
+    ''' Derives page URL (any '#anchor' kept as fragment) for object URI. '''
+    page, _, anchor = object_uri.partition( '#' )  # anchor is not path
+    new_path = f"{base_url.path}/{page}"
+    return base_url._replace( path = new_path, fragment = anchor )
+
+
+def derive_index_url( base_url: _Url ) -> _Url:
+    ''' Produces URL of ``index.html`` directly beneath base URL. '''
+    index_path = '/'.join( ( base_url.path, 'index.html' ) )
+    return base_url._replace( path = index_path )