def to_llms_txt(self, optional: bool = False) -> str:
    """
    Render the outline as an expanded llms.txt context document.

    The outline is first converted to Markdown via `self.to_markdown()` and
    then handed to `llms_txt.create_ctx`. For the duration of that call, the
    `httpx` attribute of `llms_txt.core` is replaced with the client returned
    by `get_cache_client()`; the intent is for the package to use the
    Hishel caching client. The client is closed when the call finishes.

    Args:
        optional: Forwarded unchanged to `llms_txt.create_ctx`.

    Returns:
        The string representation of the context in llms.txt format.
    """
    # Swap the HTTP layer of `llms_txt` for the Hishel cache client.
    # https://hishel.com/
    patch_target = "llms_txt.core.httpx"
    with get_cache_client() as cached_http, mock.patch(patch_target, cached_http):
        # Deferred import keeps the dependency surface small.
        from llms_txt import create_ctx

        # Expand links and output in Markdown format.
        expanded = create_ctx(self.to_markdown(), optional=optional, n_workers=None)
        return str(expanded)
def get_cache_client(ttl: t.Optional[t.Union[int, float]] = settings.http_cache_ttl):
    """
    Return the configured cache client.
    https://hishel.com/

    Builds a Hishel `CacheClient` whose storage is an SQLite database located
    at `settings.http_cache_path`, using `settings.http_timeout` as timeout.

    Args:
        ttl: Value passed to `hishel.SQLiteStorage` as its `ttl` argument.
            Defaults to `settings.http_cache_ttl`.

    Returns:
        A `hishel.CacheClient` instance.

    Raises:
        Exception: Any failure during setup is logged and re-raised with a
            descriptive message, chained to the original error. The raised
            exception has the same type as the original one whenever that
            type can be constructed from a single message; otherwise a
            `RuntimeError` is raised instead.
    """
    cache_path = None
    connection = None
    try:
        # Resolve the path once: the settings property creates the cache
        # directory on every access, and it may fail (e.g. on permissions).
        cache_path = settings.http_cache_path

        # Configure Hishel, an httpx client with caching.
        logger.info("Configuring cache. ttl=%s, path=%s", ttl, cache_path)
        connection = sqlite3.connect(cache_path, check_same_thread=False)
        controller = hishel.Controller(allow_stale=True)
        storage = hishel.SQLiteStorage(connection=connection, ttl=ttl)
        return hishel.CacheClient(
            controller=controller, storage=storage, timeout=settings.http_timeout
        )
    except Exception as e:
        # Do not leak the database handle when a later setup step failed.
        if connection is not None:
            connection.close()
        msg = (
            f"Failed to configure Hishel cache with SQLite. "
            f"ttl={ttl}, path={cache_path}. Reason: {e}"
        )
        logger.exception(msg)
        # Preserve the original exception type where possible. Some exception
        # classes cannot be constructed from a single message argument, so
        # fall back to a generic error instead of masking the real cause.
        try:
            wrapped = type(e)(msg)
        except Exception:
            wrapped = RuntimeError(msg)
        raise wrapped from e