diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index da6d435..69de030 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -5,6 +5,8 @@ on: branches: [ main ] pull_request: workflow_dispatch: + schedule: + - cron: '0 2 * * *' concurrency: group: ${{ github.workflow }}-${{ github.ref }} diff --git a/CHANGES.md b/CHANGES.md index af6ba8a..bf6a4e8 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,10 @@ # About CrateDB changelog ## Unreleased +- Outline: Improved `llms-txt`'s `get_doc_content` to fail on + resources with HTTP != 200 +- Outline: Fixed broken links to documentation +- CI: Started running software tests each night to catch regressions ## v0.0.8 - 2025-07-28 - Outline: Shrank llms-txt output to <200_000 input tokens diff --git a/src/cratedb_about/outline/cratedb-outline.yaml b/src/cratedb_about/outline/cratedb-outline.yaml index b67793d..04a56d4 100644 --- a/src/cratedb_about/outline/cratedb-outline.yaml +++ b/src/cratedb_about/outline/cratedb-outline.yaml @@ -48,8 +48,8 @@ data: - title: "CrateDB README" link: https://raw.githubusercontent.com/crate/crate/refs/heads/master/README.rst description: README about CrateDB. - - title: "CrateDB database" - link: https://cratedb.com/docs/guide/_sources/home/index.md.txt + - title: "Welcome to CrateDB" + link: https://cratedb.com/docs/guide/_sources/index.md.txt description: Benefits of CrateDB at a glance. # Reference docs @@ -124,7 +124,7 @@ data: # Data modeling - title: "Data modeling: Sequences" - link: https://cratedb.com/docs/guide/_sources/performance/inserts/sequences.rst.txt + link: https://cratedb.com/docs/guide/_sources/start/modelling/primary-key.md.txt description: About autogenerated sequences and PRIMARY KEY values in CrateDB. 
parents: [ guide ] - title: "Data modeling: Optimistic Concurrency Control" @@ -137,15 +137,15 @@ data: # Performance guidelines - title: "Guide: CrateDB sharding" - link: https://cratedb.com/docs/guide/_sources/performance/sharding.rst.txt + link: https://cratedb.com/docs/guide/_sources/performance/sharding.md.txt description: A best practice guide about sharding with CrateDB. parents: [ guide ] - title: "Guide: CrateDB query optimization" - link: https://cratedb.com/docs/guide/_sources/performance/optimization.rst.txt + link: https://cratedb.com/docs/guide/_sources/performance/optimization.md.txt description: Essential principles for optimizing queries in CrateDB while avoiding the most common pitfalls. parents: [ guide ] - title: "Guide: Design for scale" - link: https://cratedb.com/docs/guide/_sources/performance/scaling.rst.txt + link: https://cratedb.com/docs/guide/_sources/performance/scaling.md.txt description: | Critical design considerations to successfully scale CrateDB in large production environments to ensure performance and reliability as workloads grow. @@ -615,8 +615,8 @@ data: description: CrateDB’s features are available using plain SQL, and it is wire-protocol compatible to PostgreSQL. parents: [ feature ] - title: "Feature: Connectivity" - link: https://cratedb.com/docs/guide/_sources/feature/connectivity/index.md.txt - description: All CrateDB connectivity options at a glance. + link: https://cratedb.com/docs/guide/_sources/connect/index.md.txt + description: "All CrateDB connectivity options at a glance: Drivers, adapters, connectors, frameworks." 
def get_doc_content(url: str) -> str:
    """
    Return the text content of the document at `url`, failing on HTTP errors.

    Adapted from `llms_txt.core.get_doc_content`.
    Source: https://github.com/AnswerDotAI/llms-txt/blob/0.0.4/llms_txt/core.py#L74-L80
    Patched to invoke `raise_for_status()`, so that broken links surface as
    errors instead of silently contributing an error page to the output.

    :param url: Location of the document to fetch.
    :return: Text of the local file when `url` maps to an existing file below
        the local docs path reported by `llms_txt`, otherwise the body of the
        HTTP response.
    :raises: Whatever `raise_for_status()` of the response object raises on an
        unsuccessful HTTP status (`httpx.HTTPStatusError` for plain httpx).
    """
    from urllib.parse import urlparse

    from llms_txt import core as llms_core

    # Prefer a local copy of the document when `llms_txt` reports a
    # configuration whose `doc_host` is a prefix of the requested URL.
    if (cfg := llms_core._get_config()) and url.startswith(cfg.doc_host):
        relative_path = urlparse(url).path.lstrip("/")
        local_path = llms_core._local_docs_pth(cfg) / relative_path
        if local_path.exists():
            return local_path.read_text()

    # Look up `httpx` on `llms_txt.core` at call time instead of importing the
    # real module here: the caller swaps that attribute for its caching client
    # via `mock.patch("llms_txt.core.httpx", ...)`, and a function-local
    # `import httpx` would bypass the patch and hit the network uncached.
    response = llms_core.httpx.get(url, follow_redirects=True)
    response.raise_for_status()
    return response.text
markdown = self.to_markdown() - ctx = create_ctx(markdown, optional=optional, n_workers=None) + ctx = llms_txt.create_ctx(markdown, optional=optional, n_workers=None) return str(ctx) def get_item_titles(self, section_name: t.Optional[str] = None) -> t.List[str]: