diff --git a/data/configuration/general.toml b/data/configuration/general.toml
index 1b1a89f..c42f207 100644
--- a/data/configuration/general.toml
+++ b/data/configuration/general.toml
@@ -36,6 +36,10 @@ enabled = true
 name = "pydoctor"
 enabled = true
 
+[[structure-extensions]]
+name = "rustdoc"
+enabled = true
+
 # External Extension Examples
 # Uncomment and modify these examples to add external documentation processors.
 
diff --git a/sources/librovore/structures/rustdoc/__.py b/sources/librovore/structures/rustdoc/__.py
new file mode 100644
index 0000000..a90c8da
--- /dev/null
+++ b/sources/librovore/structures/rustdoc/__.py
@@ -0,0 +1,26 @@
+# vim: set filetype=python fileencoding=utf-8:
+# -*- coding: utf-8 -*-
+
+#============================================================================#
+#                                                                            #
+#  Licensed under the Apache License, Version 2.0 (the "License");           #
+#  you may not use this file except in compliance with the License.          #
+#  You may obtain a copy of the License at                                   #
+#                                                                            #
+#      http://www.apache.org/licenses/LICENSE-2.0                            #
+#                                                                            #
+#  Unless required by applicable law or agreed to in writing, software       #
+#  distributed under the License is distributed on an "AS IS" BASIS,         #
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+#  See the License for the specific language governing permissions and       #
+#  limitations under the License.                                            #
+#                                                                            #
+#============================================================================#
+
+
+''' Rustdoc subpackage import namespace. '''
+
+# ruff: noqa: F403
+
+
+from ..__ import *
diff --git a/sources/librovore/structures/rustdoc/__init__.py b/sources/librovore/structures/rustdoc/__init__.py
new file mode 100644
index 0000000..c9569c3
--- /dev/null
+++ b/sources/librovore/structures/rustdoc/__init__.py
@@ -0,0 +1,36 @@
+# vim: set filetype=python fileencoding=utf-8:
+# -*- coding: utf-8 -*-
+
+#============================================================================#
+#                                                                            #
+#  Licensed under the Apache License, Version 2.0 (the "License");           #
+#  you may not use this file except in compliance with the License.          #
+#  You may obtain a copy of the License at                                   #
+#                                                                            #
+#      http://www.apache.org/licenses/LICENSE-2.0                            #
+#                                                                            #
+#  Unless required by applicable law or agreed to in writing, software       #
+#  distributed under the License is distributed on an "AS IS" BASIS,         #
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+#  See the License for the specific language governing permissions and       #
+#  limitations under the License.                                            #
+#                                                                            #
+#============================================================================#
+
+
+''' Rustdoc documentation structure processor. '''
+
+
+from .detection import RustdocDetection
+from .main import RustdocStructureProcessor
+
+from . import __
+
+
+def register( arguments: __.cabc.Mapping[ str, __.typx.Any ] ) -> None:
+    ''' Registers configured Rustdoc structure processor instance.
+
+        The ``arguments`` mapping is currently not consulted.
+    '''
+    processor = RustdocStructureProcessor( )
+    __.structure_processors[ processor.name ] = processor
diff --git a/sources/librovore/structures/rustdoc/conversion.py b/sources/librovore/structures/rustdoc/conversion.py
new file mode 100644
index 0000000..1730b37
--- /dev/null
+++ b/sources/librovore/structures/rustdoc/conversion.py
@@ -0,0 +1,54 @@
+# vim: set filetype=python fileencoding=utf-8:
+# -*- coding: utf-8 -*-
+
+#============================================================================#
+#                                                                            #
+#  Licensed under the Apache License, Version 2.0 (the "License");           #
+#  you may not use this file except in compliance with the License.          #
+#  You may obtain a copy of the License at                                   #
+#                                                                            #
+#      http://www.apache.org/licenses/LICENSE-2.0                            #
+#                                                                            #
+#  Unless required by applicable law or agreed to in writing, software       #
+#  distributed under the License is distributed on an "AS IS" BASIS,         #
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+#  See the License for the specific language governing permissions and       #
+#  limitations under the License.                                            #
+#                                                                            #
+#============================================================================#
+
+
+''' HTML to Markdown conversion for Rustdoc content. '''
+
+
+from markdownify import markdownify as _md
+
+from . import __
+
+
+def convert_to_markdown( html: str ) -> str:
+    ''' Converts Rustdoc HTML to Markdown format.
+
+        Returns empty string for empty or whitespace-only input.
+    '''
+    if not html or not html.strip( ):
+        return ''
+    # NOTE(review): markdownify documents ``strip`` as removing the listed
+    #   tags themselves; confirm their inner text is not wanted either.
+    markdown = _md(
+        html,
+        heading_style = 'ATX',
+        code_language = 'rust',
+        strip = [ 'nav', 'aside', 'header', 'footer' ],
+    )
+    return markdown.strip( )
+
+
+def extract_code_language( element: __.typx.Any ) -> str:
+    ''' Extracts code language from Rustdoc HTML element classes.
+
+        Returns first recognized class name; falls back to ``rust``.
+    '''
+    recognized = ( 'rust', 'toml', 'text', 'console', 'sh', 'bash' )
+    classes = element.get( 'class' ) or ( )
+    return next( ( cls for cls in classes if cls in recognized ), 'rust' )
diff --git a/sources/librovore/structures/rustdoc/detection.py b/sources/librovore/structures/rustdoc/detection.py
new file mode 100644
index 0000000..9dd74a6
--- /dev/null
+++ b/sources/librovore/structures/rustdoc/detection.py
@@ -0,0 +1,124 @@
+# vim: set filetype=python fileencoding=utf-8:
+# -*- coding: utf-8 -*-
+
+#============================================================================#
+#                                                                            #
+#  Licensed under the Apache License, Version 2.0 (the "License");           #
+#  you may not use this file except in compliance with the License.          #
+#  You may obtain a copy of the License at                                   #
+#                                                                            #
+#      http://www.apache.org/licenses/LICENSE-2.0                            #
+#                                                                            #
+#  Unless required by applicable law or agreed to in writing, software       #
+#  distributed under the License is distributed on an "AS IS" BASIS,         #
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+#  See the License for the specific language governing permissions and       #
+#  limitations under the License.                                            #
+#                                                                            #
+#============================================================================#
+
+
+''' Rustdoc documentation structure detection. '''
+
+
+from bs4 import BeautifulSoup as _BeautifulSoup
+
+from . import __
+from . import extraction as _extraction
+
+
+_scribe = __.acquire_scribe( __name__ )
+
+
+class RustdocDetection( __.StructureDetection ):
+    ''' Detection result for Rustdoc documentation sources. '''
+
+    source: str
+    normalized_source: str = ''
+    rustdoc_version: __.typx.Optional[ str ] = None
+
+    @classmethod
+    def get_capabilities( cls ) -> __.StructureProcessorCapabilities:
+        ''' Rustdoc processor capabilities based on structure analysis. '''
+        return __.StructureProcessorCapabilities(
+            supported_inventory_types = frozenset( { 'rustdoc' } ),
+            content_extraction_features = frozenset( {
+                __.ContentExtractionFeatures.Signatures,
+                __.ContentExtractionFeatures.Descriptions,
+                __.ContentExtractionFeatures.CodeExamples,
+                __.ContentExtractionFeatures.CrossReferences,
+            } ),
+            confidence_by_inventory_type = __.immut.Dictionary( {
+                'rustdoc': 0.95
+            } )
+        )
+
+    @classmethod
+    async def from_source(
+        selfclass,
+        auxdata: __.ApplicationGlobals,
+        processor: __.Processor,
+        source: str,
+    ) -> __.typx.Self:
+        ''' Constructs detection from source location. '''
+        detection = await processor.detect( auxdata, source )
+        return __.typx.cast( __.typx.Self, detection )
+
+    async def extract_contents(
+        self,
+        auxdata: __.ApplicationGlobals,
+        source: str,
+        objects: __.cabc.Sequence[ __.InventoryObject ], /,
+    ) -> tuple[ __.ContentDocument, ... ]:
+        ''' Extracts documentation content for specified objects. '''
+        documents = await _extraction.extract_contents(
+            auxdata, source, objects )
+        return tuple( documents )
+
+
+async def detect_rustdoc(
+    auxdata: __.ApplicationGlobals, source_url: __.typx.Any
+) -> tuple[ bool, __.typx.Optional[ str ] ]:
+    ''' Detects if source is Rustdoc-generated documentation.
+
+        Returns detection flag and, if discovered, version string.
+        Inaccessible, absent, or unparseable content counts as no match.
+    '''
+    try:
+        html_content = await __.retrieve_url_as_text(
+            auxdata.content_cache, source_url, duration_max = 10.0 )
+    except __.DocumentationInaccessibility:
+        return False, None
+    if __.is_absent( html_content ):
+        return False, None
+    try: soup = _BeautifulSoup( html_content, 'lxml' )
+    except Exception as exc:
+        _scribe.debug( f"HTML parsing failed for {source_url}: {exc}" )
+        return False, None
+    return detect_rustdoc_markers( soup )
+
+
+def detect_rustdoc_markers(
+    soup: __.typx.Any
+) -> tuple[ bool, __.typx.Optional[ str ] ]:
+    ''' Detects Rustdoc-specific HTML markers.
+
+        Probes, in order: generator metadata, version attribute, topbar
+        element, and stylesheet name. Returns detection flag and version
+        string, where generator content or version attribute supplies one.
+    '''
+    meta_generator = soup.find( 'meta', attrs = { 'name': 'generator' } )
+    if meta_generator:
+        content = str( meta_generator.get( 'content', '' ) )
+        if 'rustdoc' in content.lower( ):
+            return True, content
+    # Single lookup covers pages both with and without the topbar element.
+    version_attr = soup.find( attrs = { 'data-rustdoc-version': True } )
+    if version_attr:
+        return True, version_attr.get( 'data-rustdoc-version' )
+    if soup.find( 'rustdoc-topbar' ):
+        return True, None
+    css_pattern = __.re.compile( r'rustdoc.*\.css' )
+    if soup.find( 'link', href = css_pattern ):
+        return True, None
+    return False, None
diff --git a/sources/librovore/structures/rustdoc/extraction.py b/sources/librovore/structures/rustdoc/extraction.py
new file mode 100644
index 0000000..f61a7b3
--- /dev/null
+++ b/sources/librovore/structures/rustdoc/extraction.py
@@ -0,0 +1,208 @@
+# vim: set filetype=python fileencoding=utf-8:
+# -*- coding: utf-8 -*-
+
+#============================================================================#
+#                                                                            #
+#  Licensed under the Apache License, Version 2.0 (the "License");           #
+#  you may not use this file except in compliance with the License.          #
+#  You may obtain a copy of the License at                                   #
+#                                                                            #
+#      http://www.apache.org/licenses/LICENSE-2.0                            #
+#                                                                            #
+#  Unless required by applicable law or agreed to in writing, software       #
+#  distributed under the License is distributed on an "AS IS" BASIS,         #
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+#  See the License for the specific language governing permissions and       #
+#  limitations under the License.                                            #
+#                                                                            #
+#============================================================================#
+
+
+''' Documentation extraction and content retrieval for Rustdoc. '''
+
+
+from bs4 import BeautifulSoup as _BeautifulSoup
+
+from . import __
+from . import conversion as _conversion
+
+
+_scribe = __.acquire_scribe( __name__ )
+
+
+async def extract_contents(
+    auxdata: __.ApplicationGlobals,
+    source: str,
+    objects: __.cabc.Sequence[ __.InventoryObject ], /,
+) -> list[ __.ContentDocument ]:
+    ''' Extracts documentation content for specified objects.
+
+        Objects are processed concurrently; those whose extraction fails
+        or produces no document are omitted from the result.
+    '''
+    if not objects: return [ ]
+    tasks = [
+        extract_object_documentation( auxdata, source, obj )
+        for obj in objects ]
+    candidate_results = await __.asyncf.gather_async(
+        *tasks, return_exceptions = True )
+    results: list[ __.ContentDocument ] = [
+        result.value for result in candidate_results
+        if __.generics.is_value( result ) and result.value is not None ]
+    return results
+
+
+async def extract_object_documentation(
+    auxdata: __.ApplicationGlobals,
+    source: str,
+    obj: __.InventoryObject,
+) -> __.Absential[ __.ContentDocument ]:
+    ''' Extracts documentation for a single Rustdoc object.
+
+        Returns absence sentinel, after logging a warning, if the page
+        cannot be retrieved, is empty, or cannot be parsed.
+    '''
+    base_url = __.normalize_base_url( source )
+    doc_path = obj.uri.lstrip( '/' )
+    # Trim trailing slash so that joined path never contains '//'.
+    full_path = f"{base_url.path.rstrip( '/' )}/{doc_path}"
+    doc_url = base_url._replace( path = full_path )
+    try:
+        html_content = await __.retrieve_url_as_text(
+            auxdata.content_cache, doc_url, duration_max = 10.0 )
+    except __.DocumentationInaccessibility as exc:
+        _scribe.warning( f"Cannot retrieve {obj.uri}: {exc}" )
+        return __.absent
+    if __.is_absent( html_content ):
+        _scribe.warning( f"Empty content from {obj.uri}." )
+        return __.absent
+    try:
+        content_parts = parse_documentation_html( html_content, obj.uri )
+    except Exception as exc:
+        _scribe.warning( f"Parse failure for {obj.uri}: {exc}" )
+        return __.absent
+    markdown_content = _assemble_markdown_content( obj, content_parts )
+    content_id = f"{obj.name}@{doc_url.geturl()}"
+    return __.ContentDocument(
+        inventory_object = obj,
+        content_id = content_id,
+        description = markdown_content,
+        documentation_url = doc_url.geturl( ),
+        extraction_metadata = __.immut.Dictionary( {
+            'extraction_method': 'rustdoc_html_parsing',
+            'relevance_score': 1.0,
+            'match_reasons': [ 'direct extraction' ],
+        } )
+    )
+
+
+def parse_documentation_html(
+    content: str, url: str
+) -> dict[ str, str ]:
+    ''' Parses HTML content to extract documentation sections.
+
+        Returns mapping with ``item_declaration``, ``documentation``, and
+        ``code_examples`` entries. Raises parse failure if HTML cannot be
+        parsed and content absence if no main content element exists.
+    '''
+    try: soup = _BeautifulSoup( content, 'lxml' )
+    except Exception as exc:
+        raise __.DocumentationParseFailure( url, exc ) from exc
+    main_content = soup.find( 'main' )
+    if not main_content:
+        section_content = soup.find( 'section', id = 'main-content' )
+        if not section_content:
+            raise __.DocumentationContentAbsence( url )
+        main_content = section_content
+    cleanup_navigation_elements( main_content )
+    item_decl = extract_item_declaration( main_content )
+    # Examples must be collected before docblocks, because docblock
+    # extraction removes nested example wrappers from the shared tree.
+    code_examples = extract_code_examples( main_content )
+    docblocks = extract_docblocks( main_content )
+    return {
+        'item_declaration': item_decl,
+        'documentation': docblocks,
+        'code_examples': code_examples,
+    }
+
+
+def cleanup_navigation_elements( soup: __.typx.Any ) -> None:
+    ''' Removes navigation and UI elements from parsed HTML. '''
+    cleanup_selectors = (
+        'nav.sidebar',
+        'rustdoc-toolbar',
+        'rustdoc-topbar',
+        '.sidebar-resizer',
+        '.src',
+        '.out-of-band',
+    )
+    for selector in cleanup_selectors:
+        for element in soup.select( selector ):
+            element.decompose( )
+
+
+def extract_code_examples( soup: __.typx.Any ) -> str:
+    ''' Extracts code examples from documentation.
+
+        Returns fenced Rust code blocks, separated by blank lines.
+    '''
+    examples: list[ str ] = [ ]
+    for wrap in soup.find_all( 'div', class_ = 'example-wrap' ):
+        code_block = wrap.find( 'pre', class_ = 'rust' )
+        if not code_block: continue
+        # Strip only outer whitespace; per-node stripping would fuse
+        # adjacent tokens and discard line breaks within the code.
+        code_text = code_block.get_text( ).strip( )
+        if code_text:
+            examples.append( f"```rust\n{code_text}\n```" )
+    return '\n\n'.join( examples )
+
+
+def extract_docblocks( soup: __.typx.Any ) -> str:
+    ''' Extracts documentation blocks from main content.
+
+        Nested code examples are removed from each block, in place,
+        before it is converted to Markdown.
+    '''
+    parts: list[ str ] = [ ]
+    for docblock in soup.find_all( 'div', class_ = 'docblock' ):
+        _remove_nested_code_examples( docblock )
+        markdown = _conversion.convert_to_markdown( str( docblock ) )
+        if markdown: parts.append( markdown )
+    return '\n\n'.join( parts )
+
+
+def extract_item_declaration( soup: __.typx.Any ) -> str:
+    ''' Extracts item type declaration from documentation.
+
+        Returns fenced Rust code block or empty string if none found.
+    '''
+    # CSS selector matches both classes regardless of attribute order.
+    item_decl = soup.select_one( 'pre.rust.item-decl' )
+    if not item_decl: return ''
+    decl_text = item_decl.get_text( ).strip( )
+    if not decl_text: return ''
+    return f"```rust\n{decl_text}\n```"
+
+
+def _assemble_markdown_content(
+    obj: __.InventoryObject, parts: dict[ str, str ]
+) -> str:
+    ''' Assembles final Markdown content from extracted parts. '''
+    sections: list[ str ] = [ ]
+    if parts.get( 'item_declaration' ):
+        sections.append( f"## Declaration\n\n{parts['item_declaration']}" )
+    if parts.get( 'documentation' ):
+        sections.append( f"## Documentation\n\n{parts['documentation']}" )
+    if parts.get( 'code_examples' ):
+        sections.append( f"## Examples\n\n{parts['code_examples']}" )
+    if not sections:
+        return f"# {obj.display_name}\n\nNo documentation available."
+    return f"# {obj.display_name}\n\n" + '\n\n'.join( sections )
+
+
+def _remove_nested_code_examples( docblock: __.typx.Any ) -> None:
+    ''' Removes nested code examples to avoid duplication. '''
+    for example_wrap in docblock.find_all( 'div', class_ = 'example-wrap' ):
+        example_wrap.decompose( )
diff --git a/sources/librovore/structures/rustdoc/main.py b/sources/librovore/structures/rustdoc/main.py
new file mode 100644
index 0000000..8ae8fff
--- /dev/null
+++ b/sources/librovore/structures/rustdoc/main.py
@@ -0,0 +1,79 @@
+# vim: set filetype=python fileencoding=utf-8:
+# -*- coding: utf-8 -*-
+
+#============================================================================#
+#                                                                            #
+#  Licensed under the Apache License, Version 2.0 (the "License");           #
+#  you may not use this file except in compliance with the License.          #
+#  You may obtain a copy of the License at                                   #
+#                                                                            #
+#      http://www.apache.org/licenses/LICENSE-2.0                            #
+#                                                                            #
+#  Unless required by applicable law or agreed to in writing, software       #
+#  distributed under the License is distributed on an "AS IS" BASIS,         #
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+#  See the License for the specific language governing permissions and       #
+#  limitations under the License.                                            #
+#                                                                            #
+#============================================================================#
+
+
+''' Main Rustdoc structure processor implementation. '''
+
+
+from . import __
+from . import detection as _detection
+
+
+_scribe = __.acquire_scribe( __name__ )
+
+
+class RustdocStructureProcessor( __.Processor ):
+    ''' Processor for Rustdoc documentation structure. '''
+
+    name: str = 'rustdoc'
+
+    @property
+    def capabilities( self ) -> __.ProcessorCapabilities:
+        ''' Returns Rustdoc processor capabilities. '''
+        return __.ProcessorCapabilities(
+            processor_name = 'rustdoc',
+            version = '1.0.0',
+            supported_filters = [
+                __.FilterCapability(
+                    name = 'item_type',
+                    description = (
+                        'Rustdoc item type (struct, enum, trait, fn)' ),
+                    type = 'string',
+                    values = None,
+                ),
+            ],
+            results_limit_max = 100,
+            response_time_typical = 'fast',
+            notes = 'Works with Rustdoc-generated documentation sites',
+        )
+
+    async def detect(
+        self, auxdata: __.ApplicationGlobals, source: str
+    ) -> __.StructureDetection:
+        ''' Detects if can process documentation from source.
+
+            Returns zero-confidence detection if source cannot be
+            normalized or if no Rustdoc markers are found.
+        '''
+        try: base_url = __.normalize_base_url( source )
+        except Exception as exc:
+            # Best effort: report inability as zero-confidence detection.
+            _scribe.debug( f"Cannot normalize source {source}: {exc}" )
+            return _detection.RustdocDetection(
+                processor = self, confidence = 0.0, source = source )
+        normalized_url = base_url.geturl( )
+        is_rustdoc, rustdoc_version = await _detection.detect_rustdoc(
+            auxdata, base_url )
+        confidence = 0.95 if is_rustdoc else 0.0
+        return _detection.RustdocDetection(
+            processor = self,
+            confidence = confidence,
+            source = source,
+            normalized_source = normalized_url,
+            rustdoc_version = rustdoc_version )